1+ #if NET6_0_OR_GREATER
2+ using System ;
3+ using System . Numerics ;
4+ using System . Runtime . Intrinsics ;
5+ using System . Runtime . Intrinsics . X86 ;
6+
7+ namespace SharpGLTF . Schema2
8+ {
/// <summary>
/// Finds the per-component minimum and maximum of a buffer of interleaved float vectors
/// (for example <c>x0 y0 z0 x1 y1 z1 ...</c>), using SIMD instructions where available.
/// </summary>
/// <remarks>
/// <para>
/// The SIMD paths read the buffer in contiguous blocks of <c>dimensions * laneCount</c> floats.
/// Because the data is interleaved, a single SIMD register holds values of several components.
/// The block size is a multiple of <c>dimensions</c>, so a given lane always sees the same component,
/// and lanes are attributed to their component only when the accumulators are reduced.
/// </para>
/// <para>
/// A multithreaded version was also tried, but it was not faster than this implementation for the
/// data sets it was tested against. Feel free to try and improve this further.
/// </para>
/// <para>
/// Accumulators start at <see cref="float.MaxValue"/> / <see cref="float.MinValue"/>, so infinities
/// beyond those bounds are clamped to them.
/// NOTE(review): NaN inputs are presumably not treated identically by the SIMD and scalar paths
/// (hardware min/max and <see cref="Math.Min(float, float)"/> differ on NaN) - confirm before relying on it.
/// </para>
/// </remarks>
public static class VectorMinMax
{
    /// <summary>Number of floats held by one 256-bit AVX register.</summary>
    private const int AvxLaneCount = 8;

    /// <summary>
    /// Computes the minimum and maximum of every component of the interleaved vectors in <paramref name="data"/>.
    /// </summary>
    /// <param name="data">Interleaved components; its length must be a multiple of <paramref name="dimensions"/>.</param>
    /// <param name="dimensions">Number of components per vector; must be greater than zero.</param>
    /// <returns>
    /// Two arrays of length <paramref name="dimensions"/> holding the per-component minimum and maximum.
    /// For empty input they keep their initial values (<see cref="float.MaxValue"/> and <see cref="float.MinValue"/>).
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="dimensions"/> is zero or negative.</exception>
    /// <exception cref="ArgumentException">The length of <paramref name="data"/> is not a multiple of <paramref name="dimensions"/>.</exception>
    public static (float[] min, float[] max) FindMinMax(ReadOnlySpan<float> data, int dimensions)
    {
        if (dimensions <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(dimensions), dimensions, "Dimensions must be greater than zero");
        }

        if (data.Length % dimensions != 0)
        {
            throw new ArgumentException($"Data length must be divisible by {dimensions}", nameof(data));
        }

        var min = new float[dimensions];
        var max = new float[dimensions];
        Array.Fill(min, float.MaxValue);
        Array.Fill(max, float.MinValue);

        if (dimensions == 3 && data.Length >= 3 * AvxLaneCount)
        {
            // Special optimized path for 3D vectors (accumulators stay in locals instead of arrays)
            ProcessSIMD3D(data, min, max);
        }
        else
        {
            // General case for other dimensions
            ProcessSIMD(data, dimensions, min, max);
        }

        return (min, max);
    }

    /// <summary>Selects the widest available implementation for an arbitrary component count.</summary>
    // ReSharper disable once InconsistentNaming
    private static void ProcessSIMD(ReadOnlySpan<float> data, int dimensions, float[] min, float[] max)
    {
        // The size checks divide instead of multiplying so a huge dimension count cannot overflow.
        if (Avx.IsSupported && data.Length / AvxLaneCount >= dimensions)
        {
            // x86/x64 with AVX: 8 floats = 256 bits. Only AVX (not AVX2) instructions are used.
            ProcessWithAVX(data, dimensions, min, max);
        }
        else if (Vector.IsHardwareAccelerated && data.Length / Vector<float>.Count >= dimensions)
        {
            // e.g. ARM / Apple silicon, where Vector<float>.Count is usually 4 (128 bits)
            ProcessWithVector(data, dimensions, min, max);
        }
        else
        {
            // Otherwise fall back to scalar operations, comparing one float at a time
            ProcessScalar(data, dimensions, min, max);
        }
    }

    /// <summary>AVX implementation for an arbitrary component count.</summary>
    // ReSharper disable once InconsistentNaming
    private static void ProcessWithAVX(ReadOnlySpan<float> data, int dimensions, float[] min, float[] max)
    {
        int blockSize = dimensions * AvxLaneCount;
        int vectorizedLength = data.Length - (data.Length % blockSize);

        var minVecs = new Vector256<float>[dimensions];
        var maxVecs = new Vector256<float>[dimensions];
        Array.Fill(minVecs, Vector256.Create(float.MaxValue));
        Array.Fill(maxVecs, Vector256.Create(float.MinValue));

        // One block is 'dimensions' consecutive registers; accumulator v tracks register v of every block.
        var registers = AsVector256(data.Slice(0, vectorizedLength));
        for (int r = 0; r < registers.Length; r += dimensions)
        {
            for (int v = 0; v < dimensions; v++)
            {
                var vec = registers[r + v];
                minVecs[v] = Avx.Min(minVecs[v], vec);
                maxVecs[v] = Avx.Max(maxVecs[v], vec);
            }
        }

        for (int v = 0; v < dimensions; v++)
        {
            MergeLanes(minVecs[v], maxVecs[v], v * AvxLaneCount, dimensions, min, max);
        }

        // Whole vectors that did not fill a complete block
        ProcessScalar(data.Slice(vectorizedLength), dimensions, min, max);
    }

    /// <summary>Portable <see cref="Vector{T}"/> implementation for an arbitrary component count.</summary>
    private static void ProcessWithVector(ReadOnlySpan<float> data, int dimensions, float[] min, float[] max)
    {
        int laneCount = Vector<float>.Count;
        int blockSize = dimensions * laneCount;
        int vectorizedLength = data.Length - (data.Length % blockSize);

        var minVecs = new Vector<float>[dimensions];
        var maxVecs = new Vector<float>[dimensions];
        Array.Fill(minVecs, new Vector<float>(float.MaxValue));
        Array.Fill(maxVecs, new Vector<float>(float.MinValue));

        // Main vectorized loop
        for (int i = 0; i < vectorizedLength; i += blockSize)
        {
            for (int v = 0; v < dimensions; v++)
            {
                var vec = new Vector<float>(data.Slice(i + v * laneCount, laneCount));
                minVecs[v] = Vector.Min(minVecs[v], vec);
                maxVecs[v] = Vector.Max(maxVecs[v], vec);
            }
        }

        for (int v = 0; v < dimensions; v++)
        {
            MergeLanes(minVecs[v], maxVecs[v], v * laneCount, dimensions, min, max);
        }

        ProcessScalar(data.Slice(vectorizedLength), dimensions, min, max);
    }

    /// <summary>Selects the widest available implementation for 3-component vectors.</summary>
    // ReSharper disable once InconsistentNaming
    private static void ProcessSIMD3D(ReadOnlySpan<float> data, float[] min, float[] max)
    {
        if (Avx.IsSupported && data.Length >= 3 * AvxLaneCount)
        {
            ProcessWithAVX3D(data, min, max);
        }
        else if (Vector.IsHardwareAccelerated && data.Length >= 3 * Vector<float>.Count)
        {
            ProcessWithVector3D(data, min, max);
        }
        else
        {
            ProcessScalar(data, 3, min, max);
        }
    }

    /// <summary>AVX implementation specialised for 3-component vectors (24 floats = 8 vectors per block).</summary>
    // ReSharper disable once InconsistentNaming
    private static void ProcessWithAVX3D(ReadOnlySpan<float> data, float[] min, float[] max)
    {
        const int blockSize = 3 * AvxLaneCount;

        var min0 = Vector256.Create(float.MaxValue);
        var min1 = min0;
        var min2 = min0;

        var max0 = Vector256.Create(float.MinValue);
        var max1 = max0;
        var max2 = max0;

        int vectorizedLength = data.Length - (data.Length % blockSize);
        var registers = AsVector256(data.Slice(0, vectorizedLength));

        // Main processing loop - three registers (8 interleaved vectors) at a time
        for (int r = 0; r < registers.Length; r += 3)
        {
            var c0 = registers[r];
            min0 = Avx.Min(min0, c0);
            max0 = Avx.Max(max0, c0);

            var c1 = registers[r + 1];
            min1 = Avx.Min(min1, c1);
            max1 = Avx.Max(max1, c1);

            var c2 = registers[r + 2];
            min2 = Avx.Min(min2, c2);
            max2 = Avx.Max(max2, c2);
        }

        // Each register mixes x, y and z; lanes are routed to their component by block offset.
        MergeLanes(min0, max0, 0, 3, min, max);
        MergeLanes(min1, max1, AvxLaneCount, 3, min, max);
        MergeLanes(min2, max2, 2 * AvxLaneCount, 3, min, max);

        // Process remaining elements
        ProcessScalar(data.Slice(vectorizedLength), 3, min, max);
    }

    /// <summary>Portable <see cref="Vector{T}"/> implementation specialised for 3-component vectors.</summary>
    private static void ProcessWithVector3D(ReadOnlySpan<float> data, float[] min, float[] max)
    {
        int laneCount = Vector<float>.Count;
        int blockSize = 3 * laneCount;

        var min0 = new Vector<float>(float.MaxValue);
        var min1 = min0;
        var min2 = min0;

        var max0 = new Vector<float>(float.MinValue);
        var max1 = max0;
        var max2 = max0;

        int vectorizedLength = data.Length - (data.Length % blockSize);

        // Main processing loop
        for (int i = 0; i < vectorizedLength; i += blockSize)
        {
            var vec0 = new Vector<float>(data.Slice(i, laneCount));
            min0 = Vector.Min(min0, vec0);
            max0 = Vector.Max(max0, vec0);

            var vec1 = new Vector<float>(data.Slice(i + laneCount, laneCount));
            min1 = Vector.Min(min1, vec1);
            max1 = Vector.Max(max1, vec1);

            var vec2 = new Vector<float>(data.Slice(i + 2 * laneCount, laneCount));
            min2 = Vector.Min(min2, vec2);
            max2 = Vector.Max(max2, vec2);
        }

        MergeLanes(min0, max0, 0, 3, min, max);
        MergeLanes(min1, max1, laneCount, 3, min, max);
        MergeLanes(min2, max2, 2 * laneCount, 3, min, max);

        // Process remaining elements
        ProcessScalar(data.Slice(vectorizedLength), 3, min, max);
    }

    /// <summary>
    /// Scalar implementation; also used for the tail left over by the SIMD paths.
    /// <paramref name="data"/> must start on a vector boundary and hold whole vectors only.
    /// </summary>
    private static void ProcessScalar(ReadOnlySpan<float> data, int dimensions, float[] min, float[] max)
    {
        for (int i = 0; i < data.Length; i += dimensions)
        {
            for (int d = 0; d < dimensions; d++)
            {
                float value = data[i + d];
                min[d] = Math.Min(min[d], value);
                max[d] = Math.Max(max[d], value);
            }
        }
    }

    /// <summary>
    /// Folds the lanes of a pair of 256-bit accumulators into the per-component results.
    /// Lane <c>j</c> covers block offset <c>blockOffset + j</c> and therefore component
    /// <c>(blockOffset + j) % dimensions</c>.
    /// </summary>
    private static void MergeLanes(Vector256<float> minVec, Vector256<float> maxVec, int blockOffset, int dimensions, float[] min, float[] max)
    {
        int component = blockOffset % dimensions;
        for (int lane = 0; lane < AvxLaneCount; lane++)
        {
            min[component] = Math.Min(min[component], minVec.GetElement(lane));
            max[component] = Math.Max(max[component], maxVec.GetElement(lane));

            if (++component == dimensions)
            {
                component = 0;
            }
        }
    }

    /// <summary>
    /// Folds the lanes of a pair of <see cref="Vector{T}"/> accumulators into the per-component results.
    /// Lane <c>j</c> covers block offset <c>blockOffset + j</c> and therefore component
    /// <c>(blockOffset + j) % dimensions</c>.
    /// </summary>
    private static void MergeLanes(Vector<float> minVec, Vector<float> maxVec, int blockOffset, int dimensions, float[] min, float[] max)
    {
        int component = blockOffset % dimensions;
        for (int lane = 0; lane < Vector<float>.Count; lane++)
        {
            min[component] = Math.Min(min[component], minVec[lane]);
            max[component] = Math.Max(max[component], maxVec[lane]);

            if (++component == dimensions)
            {
                component = 0;
            }
        }
    }

    /// <summary>
    /// Reinterprets a float span, whose length must be a multiple of 8, as 256-bit registers without copying.
    /// Only called on the AVX paths (x86/x64), where misaligned loads are supported.
    /// </summary>
    private static ReadOnlySpan<Vector256<float>> AsVector256(ReadOnlySpan<float> data)
    {
        return System.Runtime.InteropServices.MemoryMarshal.Cast<float, Vector256<float>>(data);
    }
}
311+ }
312+ #endif
0 commit comments