@@ -47,8 +47,10 @@ public static Vector128<byte> Average(Vector128<byte> left, Vector128<byte> righ
        return AdvSimd.FusedAddRoundedHalving(left, right);
    }

-    // Portable fallback: (a + b + 1) >> 1
-    return (left + right + Vector128.Create((byte)1)) >> 1;
+    // Account for potential 9th bit to ensure correct rounded result.
+    return Vector128.Narrow(
+        (Vector128.WidenLower(left) + Vector128.WidenLower(right) + Vector128<ushort>.One) >> 1,
+        (Vector128.WidenUpper(left) + Vector128.WidenUpper(right) + Vector128<ushort>.One) >> 1);
}

/// <summary>
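The widened fallback matters because the intermediate sum a + b + 1 can need nine bits, which the old byte-lane arithmetic silently dropped. A minimal scalar sketch of the rounding being reproduced, illustrative only and not part of the change:

    // Widening (here via int promotion) keeps the 9th bit of the sum before halving.
    static byte AverageRounded(byte a, byte b) => (byte)((a + b + 1) >> 1);

    // AverageRounded(255, 254) == 255; wrapping the sum in 8 bits first would give (510 & 0xFF) >> 1 == 127.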
@@ -117,13 +119,17 @@ public static Vector128<short> ShuffleHigh(Vector128<short> value, [ConstantExpe
    }

    // Don't use InverseMMShuffle here as we want to avoid the cast.
-    Vector64<short> indices = Vector64.Create(
-        (short)(control & 0x3),
-        (short)((control >> 2) & 0x3),
-        (short)((control >> 4) & 0x3),
-        (short)((control >> 6) & 0x3));
-
-    return Vector128.Create(value.GetLower(), Vector64.Shuffle(value.GetUpper(), indices));
+    Vector128<short> indices = Vector128.Create(
+        0,
+        1,
+        2,
+        3,
+        (short)((control & 0x3) + 4),
+        (short)(((control >> 2) & 0x3) + 4),
+        (short)(((control >> 4) & 0x3) + 4),
+        (short)(((control >> 6) & 0x3) + 4));
+
+    return Vector128.Shuffle(value, indices);
}

/// <summary>
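For context, the control byte packs four 2-bit lane indices (lowest bits select the first lane). The replacement keeps lanes 0-3 in place and offsets the decoded indices by 4 so a single full-width shuffle targets the upper half, mirroring the SSE2 ShuffleHigh semantics. A hypothetical decoding of one control value, illustrative only:

    // control = 0b00_01_10_11 requests upper lanes in the order 3, 2, 1, 0,
    // so the full-vector indices become { 0, 1, 2, 3, 7, 6, 5, 4 }.
    byte control = 0b00_01_10_11;
    int i4 = (control & 0x3) + 4;        // 3 + 4 == 7
    int i5 = ((control >> 2) & 0x3) + 4; // 2 + 4 == 6
    int i6 = ((control >> 4) & 0x3) + 4; // 1 + 4 == 5
    int i7 = ((control >> 6) & 0x3) + 4; // 0 + 4 == 4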
@@ -144,13 +150,17 @@ public static Vector128<short> ShuffleLow(Vector128<short> value, [ConstantExpec
    }

    // Don't use InverseMMShuffle here as we want to avoid the cast.
-    Vector64<short> indices = Vector64.Create(
-        (short)(control & 0x3),
-        (short)((control >> 2) & 0x3),
-        (short)((control >> 4) & 0x3),
-        (short)((control >> 6) & 0x3));
-
-    return Vector128.Create(Vector64.Shuffle(value.GetLower(), indices), value.GetUpper());
+    Vector128<short> indices = Vector128.Create(
+        (short)(control & 0x3),
+        (short)((control >> 2) & 0x3),
+        (short)((control >> 4) & 0x3),
+        (short)((control >> 6) & 0x3),
+        4,
+        5,
+        6,
+        7);
+
+    return Vector128.Shuffle(value, indices);
}

/// <summary>
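ShuffleLow is the mirror image: the decoded indices select among lanes 0-3 while indices 4-7 pass the upper half through untouched. A hypothetical usage sketch under that assumption, with illustrative values:

    // _MM_SHUFFLE(0, 1, 2, 3) == 0b00_01_10_11 reverses the low four lanes.
    Vector128<short> v = Vector128.Create((short)0, 1, 2, 3, 4, 5, 6, 7);
    Vector128<short> r = ShuffleLow(v, 0b00_01_10_11); // expected lanes: 3, 2, 1, 0, 4, 5, 6, 7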
@@ -237,28 +247,13 @@ public static Vector128<byte> ShiftLeftBytesInVector(Vector128<byte> value, [Con
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<short> ShiftLeftLogical(Vector128<short> value, [ConstantExpected] byte count)
{
-    if (Sse2.IsSupported)
-    {
-        return Sse2.ShiftLeftLogical(value, count);
-    }
-
    // Zero lanes where count >= 16 to match SSE2
    if (count >= 16)
    {
        return Vector128<short>.Zero;
    }

-    if (AdvSimd.IsSupported)
-    {
-        return AdvSimd.ShiftLogical(value, Vector128.Create((short)count));
-    }
-
-    if (PackedSimd.IsSupported)
-    {
-        return PackedSimd.ShiftLeft(value, count);
-    }
-
-    return Vector128.ShiftLeft(value, count);
+    return value << count;
}

/// <summary>
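With the per-ISA branches gone, the only special case the helper still encodes by hand is the SSE2 behaviour for oversized counts, which the guard above preserves. A scalar sketch of the intended semantics, illustrative only:

    // Counts of 16 or more clear every 16-bit lane instead of wrapping the shift amount.
    static short ShiftLeftLogicalScalar(short value, byte count)
        => count >= 16 ? (short)0 : (short)(value << count);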
@@ -536,6 +531,11 @@ public static Vector128<int> MultiplyAddAdjacent(Vector128<short> left, Vector12
    Vector128<int> prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
    Vector128<int> prodHi = AdvSimd.MultiplyWideningLower(left.GetUpper(), right.GetUpper());

+    if (AdvSimd.Arm64.IsSupported)
+    {
+        return AdvSimd.Arm64.AddPairwise(prodLo, prodHi);
+    }
+
    Vector128<long> v0 = AdvSimd.AddPairwiseWidening(prodLo);
    Vector128<long> v1 = AdvSimd.AddPairwiseWidening(prodHi);
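The new Arm64 fast path works because AddPairwise sums adjacent lanes of the two widened product vectors, which is exactly the pmaddwd definition. A scalar reference model, illustrative only:

    // result[i] = left[2i] * right[2i] + left[2i + 1] * right[2i + 1]
    static int[] MultiplyAddAdjacentScalar(short[] left, short[] right)
    {
        int[] result = new int[4];
        for (int i = 0; i < 4; i++)
        {
            result[i] = (left[2 * i] * right[2 * i]) + (left[(2 * i) + 1] * right[(2 * i) + 1]);
        }

        return result;
    }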
@@ -587,50 +587,26 @@ public static Vector128<short> HorizontalAdd(Vector128<short> left, Vector128<sh
        return AdvSimd.Arm64.AddPairwise(left, right);
    }

-    // Extract the low and high parts of the products shuffling them to form a result we can add together.
-    // Use out-of-bounds to zero out the unused lanes.
-    Vector128<short> even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8);
-    Vector128<short> odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8);
-    Vector128<short> v0 = Vector128.Shuffle(right, even);
-    Vector128<short> v1 = Vector128.Shuffle(right, odd);
-    Vector128<short> v2 = Vector128.Shuffle(left, even);
-    Vector128<short> v3 = Vector128.Shuffle(left, odd);
-
-    return v0 + v1 + v2 + v3;
-}
-
-/// <summary>
-/// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
-/// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
-/// </summary>
-/// <param name="left">
-/// The first vector containing packed 16-bit integers to multiply.
-/// </param>
-/// <param name="right">
-/// The second vector containing packed 16-bit integers to multiply.
-/// </param>
-/// <returns>
-/// A vector containing the low 16 bits of the products of the packed 16-bit integers
-/// from <paramref name="left"/> and <paramref name="right"/>.
-/// </returns>
-[MethodImpl(MethodImplOptions.AggressiveInlining)]
-public static Vector128<short> MultiplyLow(Vector128<short> left, Vector128<short> right)
-{
-    if (Sse2.IsSupported)
+    if (AdvSimd.IsSupported)
    {
-        return Sse2.MultiplyLow(left, right);
-    }
+        Vector128<int> v0 = AdvSimd.AddPairwiseWidening(left);
+        Vector128<int> v1 = AdvSimd.AddPairwiseWidening(right);

-    // Widen each half of the short vectors into two int vectors
-    (Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
-    (Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);
+        return Vector128.Narrow(v0, v1);
+    }

-    // Elementwise multiply: each int lane now holds the full 32-bit product
-    Vector128<int> prodLo = leftLo * rightLo;
-    Vector128<int> prodHi = leftHi * rightHi;
+    {
+        // Extract the low and high parts of the products shuffling them to form a result we can add together.
+        // Use out-of-bounds to zero out the unused lanes.
+        Vector128<short> even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8);
+        Vector128<short> odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8);
+        Vector128<short> v0 = Vector128.Shuffle(right, even);
+        Vector128<short> v1 = Vector128.Shuffle(right, odd);
+        Vector128<short> v2 = Vector128.Shuffle(left, even);
+        Vector128<short> v3 = Vector128.Shuffle(left, odd);

-    // Narrow the two int vectors back into one short vector
-    return Vector128.Narrow(prodLo, prodHi);
+        return v0 + v1 + v2 + v3;
+    }
}

/// <summary>
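For reference, the NEON branch added here (AddPairwiseWidening on each operand, then Narrow) produces the phaddw layout: lanes 0-3 hold the pairwise sums of left, lanes 4-7 those of right, with results wrapping on overflow. A scalar reference model, illustrative only:

    static short[] HorizontalAddScalar(short[] left, short[] right)
    {
        short[] result = new short[8];
        for (int i = 0; i < 4; i++)
        {
            result[i] = (short)(left[2 * i] + left[(2 * i) + 1]);
            result[i + 4] = (short)(right[2 * i] + right[(2 * i) + 1]);
        }

        return result;
    }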
@@ -655,20 +631,33 @@ public static Vector128<short> MultiplyHigh(Vector128<short> left, Vector128<sho
        return Sse2.MultiplyHigh(left, right);
    }

-    // Widen each half of the short vectors into two int vectors
-    (Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
-    (Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);
+    if (AdvSimd.IsSupported)
+    {
+        Vector128<int> prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
+        Vector128<int> prodHi = AdvSimd.MultiplyWideningUpper(left, right);
+
+        prodLo >>= 16;
+        prodHi >>= 16;
+
+        return Vector128.Narrow(prodLo, prodHi);
+    }
+
+    {
+        // Widen each half of the short vectors into two int vectors
+        (Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
+        (Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);

-    // Elementwise multiply: each int lane now holds the full 32-bit product
-    Vector128<int> prodLo = leftLo * rightLo;
-    Vector128<int> prodHi = leftHi * rightHi;
+        // Elementwise multiply: each int lane now holds the full 32-bit product
+        Vector128<int> prodLo = leftLo * rightLo;
+        Vector128<int> prodHi = leftHi * rightHi;

-    // Arithmetic shift right by 16 bits to extract the high word
-    prodLo >>= 16;
-    prodHi >>= 16;
+        // Arithmetic shift right by 16 bits to extract the high word
+        prodLo >>= 16;
+        prodHi >>= 16;

-    // Narrow the two int vectors back into one short vector
-    return Vector128.Narrow(prodLo, prodHi);
+        // Narrow the two int vectors back into one short vector
+        return Vector128.Narrow(prodLo, prodHi);
+    }
}

/// <summary>
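The NEON branch relies on MultiplyWideningLower/MultiplyWideningUpper producing the full 32-bit products for the two halves, after which the arithmetic shift and Narrow recover the high word per lane. The scalar equivalent, illustrative only:

    // Keep the top 16 bits of the full signed 32-bit product.
    static short MultiplyHighScalar(short a, short b) => (short)((a * b) >> 16);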
@@ -693,20 +682,33 @@ public static Vector128<ushort> MultiplyHigh(Vector128<ushort> left, Vector128<u
        return Sse2.MultiplyHigh(left, right);
    }

-    // Widen each half of the short vectors into two uint vectors
-    (Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
-    (Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);
+    if (AdvSimd.IsSupported)
+    {
+        Vector128<uint> prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
+        Vector128<uint> prodHi = AdvSimd.MultiplyWideningUpper(left, right);

-    // Elementwise multiply: each int lane now holds the full 32-bit product
-    Vector128<uint> prodLo = leftLo * rightLo;
-    Vector128<uint> prodHi = leftHi * rightHi;
+        prodLo >>= 16;
+        prodHi >>= 16;

-    // Arithmetic shift right by 16 bits to extract the high word
-    prodLo >>= 16;
-    prodHi >>= 16;
+        return Vector128.Narrow(prodLo, prodHi);
+    }
+
+    {
+        // Widen each half of the short vectors into two uint vectors
+        (Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
+        (Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);

-    // Narrow the two int vectors back into one short vector
-    return Vector128.Narrow(prodLo, prodHi);
+        // Elementwise multiply: each int lane now holds the full 32-bit product
+        Vector128<uint> prodLo = leftLo * rightLo;
+        Vector128<uint> prodHi = leftHi * rightHi;
+
+        // Arithmetic shift right by 16 bits to extract the high word
+        prodLo >>= 16;
+        prodHi >>= 16;
+
+        // Narrow the two int vectors back into one short vector
+        return Vector128.Narrow(prodLo, prodHi);
+    }
}

/// <summary>
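The unsigned overload follows the same shape; only the widening type changes. A scalar counterpart and a typical fixed-point use, both illustrative assumptions rather than part of the change:

    // Keep the top 16 bits of the full unsigned 32-bit product.
    static ushort MultiplyHighScalar(ushort a, ushort b) => (ushort)(((uint)a * b) >> 16);

    // Q16 scaling example: multiplying by 32768 (0.5 in Q16) halves each lane.
    // Vector128<ushort> halved = MultiplyHigh(values, Vector128.Create((ushort)32768));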
@@ -1363,90 +1365,4 @@ public static Vector128<sbyte> SubtractSaturate(Vector128<sbyte> left, Vector128
    // Narrow back to signed bytes
    return Vector128.Narrow(diffLo, diffHi);
}
-
-/// <summary>
-/// Create mask from the most significant bit of each 8-bit element in <paramref name="value"/>, and store the result.
-/// </summary>
-/// <param name="value">
-/// The vector containing packed 8-bit integers from which to create the mask.
-/// </param>
-/// <returns>
-/// A 16-bit integer mask where each bit corresponds to the most significant bit of each 8-bit element
-/// in <paramref name="value"/>.
-/// </returns>
-[MethodImpl(MethodImplOptions.AggressiveInlining)]
-public static int MoveMask(Vector128<byte> value)
-{
-    if (Sse2.IsSupported)
-    {
-        return Sse2.MoveMask(value);
-    }
-
-    // AdvSimd versions ported from Stack Overflow answer:
-    // https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
-    if (AdvSimd.Arm64.IsSupported)
-    {
-        // Shift values to align each MSB to its corresponding bit in the output
-        Vector128<sbyte> shift = Vector128.Create(-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0);
-
-        // Mask to isolate MSBs
-        Vector128<byte> msbMask = Vector128.Create((byte)0x80);
-        Vector128<byte> masked = value & msbMask;
-
-        // Shift each MSB into the correct bit position
-        Vector128<byte> shifted = AdvSimd.ShiftLogical(masked.AsSByte(), shift).AsByte();
-
-        // Sum lanes: lower 8 go into bits 0–7, upper 8 go into bits 8–15
-        byte lo = AdvSimd.Arm64.AddAcross(shifted.GetLower()).ToScalar();
-        byte hi = AdvSimd.Arm64.AddAcross(shifted.GetUpper()).ToScalar();
-
-        return lo + (hi << 8);
-    }
-
-    if (AdvSimd.IsSupported)
-    {
-        Vector128<byte> powers = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
-        Vector128<byte> msbMask = Vector128.Create((byte)0x80);
-        Vector128<byte> normalized = AdvSimd.CompareEqual(value & msbMask, msbMask); // 0xFF or 0x00
-        Vector128<byte> masked = normalized & powers;
-
-        Vector128<ushort> sum8 = AdvSimd.AddPairwiseWidening(masked);
-        Vector128<uint> sum16 = AdvSimd.AddPairwiseWidening(sum8);
-        Vector128<ulong> sum32 = AdvSimd.AddPairwiseWidening(sum16);
-
-        // Extract lower 8 bits of each 64-bit lane
-        byte lo = sum32.AsByte().GetElement(0);
-        byte hi = sum32.AsByte().GetElement(8);
-
-        return (hi << 8) | lo;
-    }
-
-    {
-        // Step 1: isolate MSBs
-        Vector128<byte> msbMask = Vector128.Create((byte)0x80);
-        Vector128<byte> masked = value & msbMask;
-
-        // Step 2: shift each byte so MSB lands in bit position [0..15]
-        // i.e. convert: 0x80 → 1 << i
-        Vector128<ushort> bitShifts = Vector128.Create((ushort)1, 2, 4, 8, 16, 32, 64, 128);
-        Vector128<ushort> bitShiftsHigh = Vector128.Create(256, 512, 1024, 2048, 4096, 8192, 16384, 32768);
-
-        // Step 3: widen to ushort
-        (Vector128<ushort> lo, Vector128<ushort> hi) = Vector128.Widen(masked);
-
-        // Step 4: compare > 0 to get 0xFFFF where MSB was set
-        lo = Vector128.ConditionalSelect(Vector128.Equals(lo, Vector128<ushort>.Zero), Vector128<ushort>.Zero, bitShifts);
-        hi = Vector128.ConditionalSelect(Vector128.Equals(hi, Vector128<ushort>.Zero), Vector128<ushort>.Zero, bitShiftsHigh);
-
-        // Step 5: bitwise OR the two halves
-        Vector128<ushort> maskVector = lo | hi;
-
-        // Step 6: horizontal OR reduction via shuffles
-        maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)4, 5, 6, 7, 0, 1, 2, 3));
-        maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)2, 3, 0, 1, 6, 7, 4, 5));
-        maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)1, 0, 3, 2, 5, 4, 7, 6));
-
-        return maskVector.ToScalar();
-    }
-}
}