66using System . Runtime . CompilerServices ;
77using System . Runtime . InteropServices ;
88using System . Runtime . Intrinsics ;
9+ using System . Runtime . Intrinsics . Arm ;
910using System . Runtime . Intrinsics . X86 ;
1011using SixLabors . ImageSharp . Memory ;
1112
@@ -160,8 +161,41 @@ static void RoundY(ReadOnlySpan<PointF> vertices, Span<float> destination, float
160161 }
161162 }
162163 }
else if (AdvSimd.IsSupported)
{
    // Arm AdvSimd path. Every iteration reads two Vector128<float> (4 PointF = 8 floats) and writes
    // one Vector128<float> (4 rounded Y values), so only the leading part of the input whose length
    // in floats is a multiple of 8 is vectorized. The scalar loop that follows picks up at 'ri'.
    int verticesLengthInFloats = vertices.Length * 2;
    int vector128FloatCount_x2 = Vector128<float>.Count * 2;
    int remainder = verticesLengthInFloats % vector128FloatCount_x2;
    int verticesLength = verticesLengthInFloats - remainder;

    if (verticesLength > 0)
    {
        // Index of the first vertex that is NOT covered by the vectorized loop.
        ri = vertices.Length - (remainder / 2);

        // Keep the iteration count integral: a float counter forces an int->float conversion on
        // every loop comparison and cannot represent large counts exactly.
        nint maxIterations = verticesLength / vector128FloatCount_x2;
        ref Vector128<float> sourceBase = ref Unsafe.As<PointF, Vector128<float>>(ref MemoryMarshal.GetReference(vertices));
        ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination));

        Vector128<float> ssRatio = Vector128.Create(subsamplingRatio);
        Vector128<float> inverseSsRatio = Vector128.Create(1F / subsamplingRatio);

        // For every 1 vector we add to the destination we read 2 from the vertices.
        for (nint i = 0, j = 0; i < maxIterations; i++, j += 2)
        {
            // Load 4 PointF
            Vector128<float> points1 = Unsafe.Add(ref sourceBase, j);
            Vector128<float> points2 = Unsafe.Add(ref sourceBase, j + 1);

            // Shuffle the points to group the Y properties (lanes 1 and 3 of each input).
            Vector128<float> pointsY = AdvSimdShuffle(points1, points2, 0b11_01_11_01);

            // Multiply by the subsampling ratio, round, then multiply by the inverted subsampling ratio and assign.
            // NOTE: this must use AdvSimd.Multiply; Sse.* intrinsics throw PlatformNotSupportedException on Arm.
            Vector128<float> rounded = AdvSimd.RoundAwayFromZero(AdvSimd.Multiply(pointsY, ssRatio));
            Unsafe.Add(ref destinationBase, i) = AdvSimd.Multiply(rounded, inverseSsRatio);
        }
    }
}
163198
164- // TODO: Arm64
165199 for ( ; ri < vertices . Length ; ri ++ )
166200 {
167201 destination [ ri ] = MathF . Round ( vertices [ ri ] . Y * subsamplingRatio , MidpointRounding . AwayFromZero ) / subsamplingRatio ;
@@ -171,6 +205,17 @@ static void RoundY(ReadOnlySpan<PointF> vertices, Span<float> destination, float
171205 return new ScanEdgeCollection ( buffer , walker . EdgeCounter ) ;
172206 }
173207
/// <summary>
/// Builds a vector from four lanes selected by <paramref name="control"/>: the two low result
/// lanes are taken from <paramref name="a"/> and the two high result lanes from <paramref name="b"/>.
/// </summary>
/// <param name="a">Source of result lanes 0 and 1.</param>
/// <param name="b">Source of result lanes 2 and 3.</param>
/// <param name="control">
/// Four 2-bit lane selectors, lowest bits first: bits 0-1 and 2-3 index into <paramref name="a"/>,
/// bits 4-5 and 6-7 index into <paramref name="b"/>.
/// </param>
/// <returns>The shuffled vector.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<float> AdvSimdShuffle(Vector128<float> a, Vector128<float> b, byte control)
{
    // Decode the four 2-bit selectors up front.
    byte lane0 = (byte)(control & 0x3);
    byte lane1 = (byte)((control >> 2) & 0x3);
    byte lane2 = (byte)((control >> 4) & 0x3);
    byte lane3 = (byte)((control >> 6) & 0x3);

    // Seed every lane with the first selected element, then overwrite lanes 1..3.
    Vector128<float> shuffled = Vector128.Create(AdvSimd.Extract(a, lane0));
    shuffled = AdvSimd.Insert(shuffled, 1, AdvSimd.Extract(a, lane1));
    shuffled = AdvSimd.Insert(shuffled, 2, AdvSimd.Extract(b, lane2));
    shuffled = AdvSimd.Insert(shuffled, 3, AdvSimd.Extract(b, lane3));

    return shuffled;
}
218+
174219 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
175220 private static VertexCategory CreateVertexCategory ( EdgeCategory previousCategory , EdgeCategory currentCategory )
176221 {
0 commit comments