Skip to content

Commit 82f86e8

Browse files
author
msftbot[bot]
authored
Performance improvement in Count<T> extension (#3548)
## PR Type What kind of change does this PR introduce? <!-- Please uncomment one or more that apply to this PR. --> - Performance improvement <!-- - Bugfix --> <!-- - Feature --> <!-- - Code style update (formatting) --> <!-- - Refactoring (no functional changes, no API changes) --> <!-- - Build or CI related changes --> <!-- - Documentation content changes --> <!-- - Sample app changes --> <!-- - Other... Please describe: --> ## What is the new behavior? <!-- Describe how this issue was resolved or changed. --> About a 20% improvement on .NET 5 when working with `char`-sized (2-byte) or larger types: ![image](https://user-images.githubusercontent.com/10199417/97509236-ff526e80-1981-11eb-8a90-f8aa72f1551e.png) This was done by adding an unrolled loop to the vectorized path of the SIMD-accelerated version of `Count<T>`. ## PR Checklist Please check if your PR fulfills the following requirements: - [X] Tested code with current [supported SDKs](../readme.md#supported) - [ ] ~~Pull Request has been submitted to the documentation repository [instructions](../contributing.md#docs). Link: <!-- docs PR link -->~~ - [ ] ~~Sample in sample app has been added / updated (for bug fixes / features)~~ - [ ] ~~Icon has been created (if new sample) following the [Thumbnail Style Guide and templates](https://github.com/windows-toolkit/WindowsCommunityToolkit-design-assets)~~ - [X] Tests for the changes have been added (for bug fixes / features) (if applicable) - [X] Header has been added to all new source files (run *build/UpdateHeaders.bat*) - [X] Contains **NO** breaking changes
2 parents 5129d2c + 6c4888d commit 82f86e8

File tree

2 files changed

+65
-22
lines changed

2 files changed

+65
-22
lines changed

Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Count.cs

Lines changed: 65 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,6 @@ public static nint Count<T>(ref T r0, nint length, T value)
7979
/// Implements <see cref="Count{T}"/> with a sequential search.
8080
/// </summary>
8181
[Pure]
82-
#if NETCOREAPP3_1
83-
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
84-
#endif
8582
private static nint CountSequential<T>(ref T r0, nint length, T value)
8683
where T : IEquatable<T>
8784
{
@@ -132,9 +129,6 @@ private static nint CountSequential<T>(ref T r0, nint length, T value)
132129
/// Implements <see cref="Count{T}"/> with a vectorized search.
133130
/// </summary>
134131
[Pure]
135-
#if NETCOREAPP3_1
136-
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
137-
#endif
138132
private static nint CountSimd<T>(ref T r0, nint length, T value)
139133
where T : unmanaged, IEquatable<T>
140134
{
@@ -161,6 +155,67 @@ private static nint CountSimd<T>(ref T r0, nint length, T value)
161155

162156
var partials = Vector<T>.Zero;
163157

158+
// Unrolled vectorized loop, with 8 unrolled iterations. We only run this when the
159+
// current type T is at least 2 bytes in size, otherwise the average chunk length
160+
// would always be too small to be able to trigger the unrolled loop, and the overall
161+
// performance would just be slightly worse due to the additional conditional branches.
162+
if (typeof(T) != typeof(sbyte))
163+
{
164+
while (chunkLength >= Vector<T>.Count * 8)
165+
{
166+
ref T ri0 = ref Unsafe.Add(ref r0, offset + (Vector<T>.Count * 0));
167+
var vi0 = Unsafe.As<T, Vector<T>>(ref ri0);
168+
var ve0 = Vector.Equals(vi0, vc);
169+
170+
partials -= ve0;
171+
172+
ref T ri1 = ref Unsafe.Add(ref r0, offset + (Vector<T>.Count * 1));
173+
var vi1 = Unsafe.As<T, Vector<T>>(ref ri1);
174+
var ve1 = Vector.Equals(vi1, vc);
175+
176+
partials -= ve1;
177+
178+
ref T ri2 = ref Unsafe.Add(ref r0, offset + (Vector<T>.Count * 2));
179+
var vi2 = Unsafe.As<T, Vector<T>>(ref ri2);
180+
var ve2 = Vector.Equals(vi2, vc);
181+
182+
partials -= ve2;
183+
184+
ref T ri3 = ref Unsafe.Add(ref r0, offset + (Vector<T>.Count * 3));
185+
var vi3 = Unsafe.As<T, Vector<T>>(ref ri3);
186+
var ve3 = Vector.Equals(vi3, vc);
187+
188+
partials -= ve3;
189+
190+
ref T ri4 = ref Unsafe.Add(ref r0, offset + (Vector<T>.Count * 4));
191+
var vi4 = Unsafe.As<T, Vector<T>>(ref ri4);
192+
var ve4 = Vector.Equals(vi4, vc);
193+
194+
partials -= ve4;
195+
196+
ref T ri5 = ref Unsafe.Add(ref r0, offset + (Vector<T>.Count * 5));
197+
var vi5 = Unsafe.As<T, Vector<T>>(ref ri5);
198+
var ve5 = Vector.Equals(vi5, vc);
199+
200+
partials -= ve5;
201+
202+
ref T ri6 = ref Unsafe.Add(ref r0, offset + (Vector<T>.Count * 6));
203+
var vi6 = Unsafe.As<T, Vector<T>>(ref ri6);
204+
var ve6 = Vector.Equals(vi6, vc);
205+
206+
partials -= ve6;
207+
208+
ref T ri7 = ref Unsafe.Add(ref r0, offset + (Vector<T>.Count * 7));
209+
var vi7 = Unsafe.As<T, Vector<T>>(ref ri7);
210+
var ve7 = Vector.Equals(vi7, vc);
211+
212+
partials -= ve7;
213+
214+
chunkLength -= Vector<T>.Count * 8;
215+
offset += Vector<T>.Count * 8;
216+
}
217+
}
218+
164219
while (chunkLength >= Vector<T>.Count)
165220
{
166221
ref T ri = ref Unsafe.Add(ref r0, offset);
@@ -242,28 +297,22 @@ private static nint CountSimd<T>(ref T r0, nint length, T value)
242297
private static unsafe nint GetUpperBound<T>()
243298
where T : unmanaged
244299
{
245-
if (typeof(T) == typeof(byte) ||
246-
typeof(T) == typeof(sbyte) ||
247-
typeof(T) == typeof(bool))
300+
if (typeof(T) == typeof(sbyte))
248301
{
249302
return sbyte.MaxValue;
250303
}
251304

252-
if (typeof(T) == typeof(char) ||
253-
typeof(T) == typeof(ushort) ||
254-
typeof(T) == typeof(short))
305+
if (typeof(T) == typeof(short))
255306
{
256307
return short.MaxValue;
257308
}
258309

259-
if (typeof(T) == typeof(int) ||
260-
typeof(T) == typeof(uint))
310+
if (typeof(T) == typeof(int))
261311
{
262312
return int.MaxValue;
263313
}
264314

265-
if (typeof(T) == typeof(long) ||
266-
typeof(T) == typeof(ulong))
315+
if (typeof(T) == typeof(long))
267316
{
268317
if (sizeof(nint) == sizeof(int))
269318
{

Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Hash.cs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,6 @@ internal static partial class SpanHelper
2121
/// <param name="length">The number of items to hash.</param>
2222
/// <returns>The Djb2 value for the input sequence of items.</returns>
2323
[Pure]
24-
#if NETCOREAPP3_1
25-
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
26-
#endif
2724
public static int GetDjb2HashCode<T>(ref T r0, nint length)
2825
where T : notnull
2926
{
@@ -87,9 +84,6 @@ public static int GetDjb2HashCode<T>(ref T r0, nint length)
8784
/// faster than <see cref="GetDjb2HashCode{T}"/>, as it can parallelize much of the workload.
8885
/// </remarks>
8986
[Pure]
90-
#if NETCOREAPP3_1
91-
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
92-
#endif
9387
public static unsafe int GetDjb2LikeByteHash(ref byte r0, nint length)
9488
{
9589
int hash = 5381;

0 commit comments

Comments
 (0)