Skip to content

Commit 3627073

Browse files
Update based on feedback
1 parent 62a0666 commit 3627073

File tree

6 files changed

+125
-217
lines changed

6 files changed

+125
-217
lines changed

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,11 @@ private static void Shuffle4(
375375
}
376376
else if (Vector256.IsHardwareAccelerated)
377377
{
378+
// ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb).
379+
// MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
380+
// so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F)
381+
// for indexing within each lane, and ignores the upper bits unless bit 7 is set,
382+
// this usage is guaranteed to remain within-lane and non-zeroing.
378383
Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
379384
Shuffle.MMShuffleSpan(ref temp, control);
380385
Vector256<byte> mask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp));
@@ -391,17 +396,17 @@ private static void Shuffle4(
391396
ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
392397
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
393398

394-
vd0 = Vector256_.ShuffleNative(vs0, mask);
395-
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
396-
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
397-
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
399+
vd0 = Vector256_.ShufflePerLane(vs0, mask);
400+
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask);
401+
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask);
402+
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask);
398403
}
399404

400405
if (m > 0)
401406
{
402407
for (nuint i = u; i < n; i++)
403408
{
404-
Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
409+
Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask);
405410
}
406411
}
407412
}

src/ImageSharp/Common/Helpers/Vector128Utilities.cs

Lines changed: 96 additions & 180 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,10 @@ public static Vector128<byte> Average(Vector128<byte> left, Vector128<byte> righ
4747
return AdvSimd.FusedAddRoundedHalving(left, right);
4848
}
4949

50-
// Portable fallback: (a + b + 1) >> 1
51-
return (left + right + Vector128.Create((byte)1)) >> 1;
50+
// Account for potential 9th bit to ensure correct rounded result.
51+
return Vector128.Narrow(
52+
(Vector128.WidenLower(left) + Vector128.WidenLower(right) + Vector128<ushort>.One) >> 1,
53+
(Vector128.WidenUpper(left) + Vector128.WidenUpper(right) + Vector128<ushort>.One) >> 1);
5254
}
5355

5456
/// <summary>
@@ -117,13 +119,17 @@ public static Vector128<short> ShuffleHigh(Vector128<short> value, [ConstantExpe
117119
}
118120

119121
// Don't use InverseMMShuffle here as we want to avoid the cast.
120-
Vector64<short> indices = Vector64.Create(
121-
(short)(control & 0x3),
122-
(short)((control >> 2) & 0x3),
123-
(short)((control >> 4) & 0x3),
124-
(short)((control >> 6) & 0x3));
125-
126-
return Vector128.Create(value.GetLower(), Vector64.Shuffle(value.GetUpper(), indices));
122+
Vector128<short> indices = Vector128.Create(
123+
0,
124+
1,
125+
2,
126+
3,
127+
(short)((control & 0x3) + 4),
128+
(short)(((control >> 2) & 0x3) + 4),
129+
(short)(((control >> 4) & 0x3) + 4),
130+
(short)(((control >> 6) & 0x3) + 4));
131+
132+
return Vector128.Shuffle(value, indices);
127133
}
128134

129135
/// <summary>
@@ -144,13 +150,17 @@ public static Vector128<short> ShuffleLow(Vector128<short> value, [ConstantExpec
144150
}
145151

146152
// Don't use InverseMMShuffle here as we want to avoid the cast.
147-
Vector64<short> indices = Vector64.Create(
148-
(short)(control & 0x3),
149-
(short)((control >> 2) & 0x3),
150-
(short)((control >> 4) & 0x3),
151-
(short)((control >> 6) & 0x3));
152-
153-
return Vector128.Create(Vector64.Shuffle(value.GetLower(), indices), value.GetUpper());
153+
Vector128<short> indices = Vector128.Create(
154+
(short)(control & 0x3),
155+
(short)((control >> 2) & 0x3),
156+
(short)((control >> 4) & 0x3),
157+
(short)((control >> 6) & 0x3),
158+
4,
159+
5,
160+
6,
161+
7);
162+
163+
return Vector128.Shuffle(value, indices);
154164
}
155165

156166
/// <summary>
@@ -237,28 +247,13 @@ public static Vector128<byte> ShiftLeftBytesInVector(Vector128<byte> value, [Con
237247
[MethodImpl(MethodImplOptions.AggressiveInlining)]
238248
public static Vector128<short> ShiftLeftLogical(Vector128<short> value, [ConstantExpected] byte count)
239249
{
240-
if (Sse2.IsSupported)
241-
{
242-
return Sse2.ShiftLeftLogical(value, count);
243-
}
244-
245250
// Zero lanes where count >= 16 to match SSE2
246251
if (count >= 16)
247252
{
248253
return Vector128<short>.Zero;
249254
}
250255

251-
if (AdvSimd.IsSupported)
252-
{
253-
return AdvSimd.ShiftLogical(value, Vector128.Create((short)count));
254-
}
255-
256-
if (PackedSimd.IsSupported)
257-
{
258-
return PackedSimd.ShiftLeft(value, count);
259-
}
260-
261-
return Vector128.ShiftLeft(value, count);
256+
return value << count;
262257
}
263258

264259
/// <summary>
@@ -536,6 +531,11 @@ public static Vector128<int> MultiplyAddAdjacent(Vector128<short> left, Vector12
536531
Vector128<int> prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
537532
Vector128<int> prodHi = AdvSimd.MultiplyWideningLower(left.GetUpper(), right.GetUpper());
538533

534+
if (AdvSimd.Arm64.IsSupported)
535+
{
536+
return AdvSimd.Arm64.AddPairwise(prodLo, prodHi);
537+
}
538+
539539
Vector128<long> v0 = AdvSimd.AddPairwiseWidening(prodLo);
540540
Vector128<long> v1 = AdvSimd.AddPairwiseWidening(prodHi);
541541

@@ -587,50 +587,26 @@ public static Vector128<short> HorizontalAdd(Vector128<short> left, Vector128<sh
587587
return AdvSimd.Arm64.AddPairwise(left, right);
588588
}
589589

590-
// Extract the low and high parts of the products shuffling them to form a result we can add together.
591-
// Use out-of-bounds to zero out the unused lanes.
592-
Vector128<short> even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8);
593-
Vector128<short> odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8);
594-
Vector128<short> v0 = Vector128.Shuffle(right, even);
595-
Vector128<short> v1 = Vector128.Shuffle(right, odd);
596-
Vector128<short> v2 = Vector128.Shuffle(left, even);
597-
Vector128<short> v3 = Vector128.Shuffle(left, odd);
598-
599-
return v0 + v1 + v2 + v3;
600-
}
601-
602-
/// <summary>
603-
/// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
604-
/// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
605-
/// </summary>
606-
/// <param name="left">
607-
/// The first vector containing packed 16-bit integers to multiply.
608-
/// </param>
609-
/// <param name="right">
610-
/// The second vector containing packed 16-bit integers to multiply.
611-
/// </param>
612-
/// <returns>
613-
/// A vector containing the low 16 bits of the products of the packed 16-bit integers
614-
/// from <paramref name="left"/> and <paramref name="right"/>.
615-
/// </returns>
616-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
617-
public static Vector128<short> MultiplyLow(Vector128<short> left, Vector128<short> right)
618-
{
619-
if (Sse2.IsSupported)
590+
if (AdvSimd.IsSupported)
620591
{
621-
return Sse2.MultiplyLow(left, right);
622-
}
592+
Vector128<int> v0 = AdvSimd.AddPairwiseWidening(left);
593+
Vector128<int> v1 = AdvSimd.AddPairwiseWidening(right);
623594

624-
// Widen each half of the short vectors into two int vectors
625-
(Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
626-
(Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);
595+
return Vector128.Narrow(v0, v1);
596+
}
627597

628-
// Elementwise multiply: each int lane now holds the full 32-bit product
629-
Vector128<int> prodLo = leftLo * rightLo;
630-
Vector128<int> prodHi = leftHi * rightHi;
598+
{
599+
// Extract the low and high parts of the products shuffling them to form a result we can add together.
600+
// Use out-of-bounds to zero out the unused lanes.
601+
Vector128<short> even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8);
602+
Vector128<short> odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8);
603+
Vector128<short> v0 = Vector128.Shuffle(right, even);
604+
Vector128<short> v1 = Vector128.Shuffle(right, odd);
605+
Vector128<short> v2 = Vector128.Shuffle(left, even);
606+
Vector128<short> v3 = Vector128.Shuffle(left, odd);
631607

632-
// Narrow the two int vectors back into one short vector
633-
return Vector128.Narrow(prodLo, prodHi);
608+
return v0 + v1 + v2 + v3;
609+
}
634610
}
635611

636612
/// <summary>
@@ -655,20 +631,33 @@ public static Vector128<short> MultiplyHigh(Vector128<short> left, Vector128<sho
655631
return Sse2.MultiplyHigh(left, right);
656632
}
657633

658-
// Widen each half of the short vectors into two int vectors
659-
(Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
660-
(Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);
634+
if (AdvSimd.IsSupported)
635+
{
636+
Vector128<int> prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
637+
Vector128<int> prodHi = AdvSimd.MultiplyWideningUpper(left, right);
638+
639+
prodLo >>= 16;
640+
prodHi >>= 16;
641+
642+
return Vector128.Narrow(prodLo, prodHi);
643+
}
644+
645+
{
646+
// Widen each half of the short vectors into two int vectors
647+
(Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
648+
(Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);
661649

662-
// Elementwise multiply: each int lane now holds the full 32-bit product
663-
Vector128<int> prodLo = leftLo * rightLo;
664-
Vector128<int> prodHi = leftHi * rightHi;
650+
// Elementwise multiply: each int lane now holds the full 32-bit product
651+
Vector128<int> prodLo = leftLo * rightLo;
652+
Vector128<int> prodHi = leftHi * rightHi;
665653

666-
// Arithmetic shift right by 16 bits to extract the high word
667-
prodLo >>= 16;
668-
prodHi >>= 16;
654+
// Arithmetic shift right by 16 bits to extract the high word
655+
prodLo >>= 16;
656+
prodHi >>= 16;
669657

670-
// Narrow the two int vectors back into one short vector
671-
return Vector128.Narrow(prodLo, prodHi);
658+
// Narrow the two int vectors back into one short vector
659+
return Vector128.Narrow(prodLo, prodHi);
660+
}
672661
}
673662

674663
/// <summary>
@@ -693,20 +682,33 @@ public static Vector128<ushort> MultiplyHigh(Vector128<ushort> left, Vector128<u
693682
return Sse2.MultiplyHigh(left, right);
694683
}
695684

696-
// Widen each half of the short vectors into two uint vectors
697-
(Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
698-
(Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);
685+
if (AdvSimd.IsSupported)
686+
{
687+
Vector128<uint> prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
688+
Vector128<uint> prodHi = AdvSimd.MultiplyWideningUpper(left, right);
699689

700-
// Elementwise multiply: each int lane now holds the full 32-bit product
701-
Vector128<uint> prodLo = leftLo * rightLo;
702-
Vector128<uint> prodHi = leftHi * rightHi;
690+
prodLo >>= 16;
691+
prodHi >>= 16;
703692

704-
// Arithmetic shift right by 16 bits to extract the high word
705-
prodLo >>= 16;
706-
prodHi >>= 16;
693+
return Vector128.Narrow(prodLo, prodHi);
694+
}
695+
696+
{
697+
// Widen each half of the short vectors into two uint vectors
698+
(Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
699+
(Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);
707700

708-
// Narrow the two int vectors back into one short vector
709-
return Vector128.Narrow(prodLo, prodHi);
701+
// Elementwise multiply: each int lane now holds the full 32-bit product
702+
Vector128<uint> prodLo = leftLo * rightLo;
703+
Vector128<uint> prodHi = leftHi * rightHi;
704+
705+
// Arithmetic shift right by 16 bits to extract the high word
706+
prodLo >>= 16;
707+
prodHi >>= 16;
708+
709+
// Narrow the two int vectors back into one short vector
710+
return Vector128.Narrow(prodLo, prodHi);
711+
}
710712
}
711713

712714
/// <summary>
@@ -1363,90 +1365,4 @@ public static Vector128<sbyte> SubtractSaturate(Vector128<sbyte> left, Vector128
13631365
// Narrow back to signed bytes
13641366
return Vector128.Narrow(diffLo, diffHi);
13651367
}
1366-
1367-
/// <summary>
1368-
/// Create mask from the most significant bit of each 8-bit element in <paramref name="value"/>, and store the result.
1369-
/// </summary>
1370-
/// <param name="value">
1371-
/// The vector containing packed 8-bit integers from which to create the mask.
1372-
/// </param>
1373-
/// <returns>
1374-
/// A 16-bit integer mask where each bit corresponds to the most significant bit of each 8-bit element
1375-
/// in <paramref name="value"/>.
1376-
/// </returns>
1377-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1378-
public static int MoveMask(Vector128<byte> value)
1379-
{
1380-
if (Sse2.IsSupported)
1381-
{
1382-
return Sse2.MoveMask(value);
1383-
}
1384-
1385-
// AdvSimd versions ported from Stack Overflow answer:
1386-
// https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
1387-
if (AdvSimd.Arm64.IsSupported)
1388-
{
1389-
// Shift values to align each MSB to its corresponding bit in the output
1390-
Vector128<sbyte> shift = Vector128.Create(-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0);
1391-
1392-
// Mask to isolate MSBs
1393-
Vector128<byte> msbMask = Vector128.Create((byte)0x80);
1394-
Vector128<byte> masked = value & msbMask;
1395-
1396-
// Shift each MSB into the correct bit position
1397-
Vector128<byte> shifted = AdvSimd.ShiftLogical(masked.AsSByte(), shift).AsByte();
1398-
1399-
// Sum lanes: lower 8 go into bits 0–7, upper 8 go into bits 8–15
1400-
byte lo = AdvSimd.Arm64.AddAcross(shifted.GetLower()).ToScalar();
1401-
byte hi = AdvSimd.Arm64.AddAcross(shifted.GetUpper()).ToScalar();
1402-
1403-
return lo + (hi << 8);
1404-
}
1405-
1406-
if (AdvSimd.IsSupported)
1407-
{
1408-
Vector128<byte> powers = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
1409-
Vector128<byte> msbMask = Vector128.Create((byte)0x80);
1410-
Vector128<byte> normalized = AdvSimd.CompareEqual(value & msbMask, msbMask); // 0xFF or 0x00
1411-
Vector128<byte> masked = normalized & powers;
1412-
1413-
Vector128<ushort> sum8 = AdvSimd.AddPairwiseWidening(masked);
1414-
Vector128<uint> sum16 = AdvSimd.AddPairwiseWidening(sum8);
1415-
Vector128<ulong> sum32 = AdvSimd.AddPairwiseWidening(sum16);
1416-
1417-
// Extract lower 8 bits of each 64-bit lane
1418-
byte lo = sum32.AsByte().GetElement(0);
1419-
byte hi = sum32.AsByte().GetElement(8);
1420-
1421-
return (hi << 8) | lo;
1422-
}
1423-
1424-
{
1425-
// Step 1: isolate MSBs
1426-
Vector128<byte> msbMask = Vector128.Create((byte)0x80);
1427-
Vector128<byte> masked = value & msbMask;
1428-
1429-
// Step 2: shift each byte so MSB lands in bit position [0..15]
1430-
// i.e. convert: 0x80 → 1 << i
1431-
Vector128<ushort> bitShifts = Vector128.Create((ushort)1, 2, 4, 8, 16, 32, 64, 128);
1432-
Vector128<ushort> bitShiftsHigh = Vector128.Create(256, 512, 1024, 2048, 4096, 8192, 16384, 32768);
1433-
1434-
// Step 3: widen to ushort
1435-
(Vector128<ushort> lo, Vector128<ushort> hi) = Vector128.Widen(masked);
1436-
1437-
// Step 4: compare > 0 to get 0xFFFF where MSB was set
1438-
lo = Vector128.ConditionalSelect(Vector128.Equals(lo, Vector128<ushort>.Zero), Vector128<ushort>.Zero, bitShifts);
1439-
hi = Vector128.ConditionalSelect(Vector128.Equals(hi, Vector128<ushort>.Zero), Vector128<ushort>.Zero, bitShiftsHigh);
1440-
1441-
// Step 5: bitwise OR the two halves
1442-
Vector128<ushort> maskVector = lo | hi;
1443-
1444-
// Step 6: horizontal OR reduction via shuffles
1445-
maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)4, 5, 6, 7, 0, 1, 2, 3));
1446-
maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)2, 3, 0, 1, 6, 7, 4, 5));
1447-
maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)1, 0, 3, 2, 5, 4, 7, 6));
1448-
1449-
return maskVector.ToScalar();
1450-
}
1451-
}
14521368
}

0 commit comments

Comments (0)