@@ -7,13 +7,14 @@ namespace SharpGLTF.Schema2
77{
88 /// <summary>
99 /// Somewhat optimized version of finding min/max values in a vector of floats. Please note some effort
10- /// has been made to test a multi threaded version of this as well but it was not faster than this implementation
10+ /// has been made to test a multithreaded version of this as well, but it was not faster than this implementation
1111 /// for the data sets it was tested against. If anybody feels so inclined, please feel free to try and improve
1212 /// this further.
1313 /// </summary>
1414 public static class VectorMinMax
1515 {
16- public static ( float [ ] min , float [ ] max ) FindMinMax ( ReadOnlySpan < float > data , int dimensions ) {
16+ public static ( float [ ] min , float [ ] max ) FindMinMax ( ReadOnlySpan < float > data , int dimensions )
17+ {
1718 if ( data . Length % dimensions != 0 )
1819 throw new ArgumentException ( $ "Data length must be divisible by { dimensions } ") ;
1920
@@ -22,71 +23,93 @@ public static (float[] min, float[] max) FindMinMax(ReadOnlySpan<float> data, in
2223 Array . Fill ( min , float . MaxValue ) ;
2324 Array . Fill ( max , float . MinValue ) ;
2425
25- // Just use SIMD without parallelization for each individual call
26- ProcessSIMD ( data , dimensions , min , max ) ;
26+
27+ if ( dimensions == 3 && data . Length >= 24 )
28+ {
29+ // Special optimized path for 3D vectors
30+ ProcessSIMD3D ( data , min , max ) ;
31+ } else
32+ {
33+ // General case for other dimensions
34+ ProcessSIMD ( data , dimensions , min , max ) ;
35+ }
2736
2837 return ( min , max ) ;
2938 }
3039
// ReSharper disable once InconsistentNaming
/// <summary>
/// Pins <paramref name="data"/> and hands it to the widest implementation that is both
/// supported by the hardware and has enough floats to fill one vectorized block.
/// </summary>
/// <param name="data">Flat float buffer to scan.</param>
/// <param name="dimensions">Number of components per element.</param>
/// <param name="min">Per-component minima, updated in place.</param>
/// <param name="max">Per-component maxima, updated in place.</param>
private static unsafe void ProcessSIMD(ReadOnlySpan<float> data, int dimensions, float[] min, float[] max)
{
    int count = data.Length;

    fixed (float* ptr = data)
    {
        // 256-bit path: needs 8 floats per component to fill one block
        if (Avx2.IsSupported && count >= dimensions * 8)
        {
            ProcessWithAVX(ptr, count, dimensions, min, max);
            return;
        }

        // Portable Vector<T> path (e.g. ARM / Apple silicon, where Vector<float>.Count is usually 4)
        if (Vector.IsHardwareAccelerated && count >= dimensions * Vector<float>.Count)
        {
            ProcessWithVector(ptr, count, dimensions, min, max);
            return;
        }

        // No usable acceleration, or too little data: compare one float at a time
        ProcessScalar(ptr, count, dimensions, min, max);
    }
}
4660
// ReSharper disable once InconsistentNaming
/// <summary>
/// Folds per-component minima and maxima of an interleaved float buffer into
/// <paramref name="min"/> and <paramref name="max"/> using 256-bit AVX loads.
/// </summary>
/// <param name="ptr">Pointer to the first float of the buffer.</param>
/// <param name="length">Total number of floats readable from <paramref name="ptr"/>.</param>
/// <param name="dimensions">Number of components per element.</param>
/// <param name="min">Per-component minima; existing values are kept when they are already smaller.</param>
/// <param name="max">Per-component maxima; existing values are kept when they are already larger.</param>
private static unsafe void ProcessWithAVX(float* ptr, int length, int dimensions, float[] min, float[] max)
{
    const int LaneCount = 8; // floats per 256-bit register

    var minVecs = new Vector256<float>[dimensions];
    var maxVecs = new Vector256<float>[dimensions];

    for (int v = 0; v < dimensions; v++)
    {
        minVecs[v] = Vector256.Create(float.MaxValue);
        maxVecs[v] = Vector256.Create(float.MinValue);
    }

    // One block is dimensions * 8 floats, i.e. 8 whole elements, so every block
    // starts on an element boundary and the lane -> component mapping never shifts.
    int blockSize = dimensions * LaneCount;
    int vectorizedLength = length - (length % blockSize);

    int i = 0;
    for (; i < vectorizedLength; i += blockSize)
    {
        for (int v = 0; v < dimensions; v++)
        {
            var vec = Avx.LoadVector256(ptr + i + v * LaneCount);
            minVecs[v] = Avx.Min(minVecs[v], vec);
            maxVecs[v] = Avx.Max(maxVecs[v], vec);
        }
    }

    // Lane j of accumulator v only ever saw the floats at block offset v * 8 + j,
    // which belong to component (v * 8 + j) % dimensions. Each lane must therefore be
    // reduced into that component rather than into component v; otherwise values of
    // different components get mixed together.
    var temp = stackalloc float[LaneCount];
    for (int v = 0; v < dimensions; v++)
    {
        int firstOffset = v * LaneCount;

        Avx.Store(temp, minVecs[v]);
        for (int j = 0; j < LaneCount; j++)
        {
            int component = (firstOffset + j) % dimensions;
            min[component] = Math.Min(min[component], temp[j]);
        }

        Avx.Store(temp, maxVecs[v]);
        for (int j = 0; j < LaneCount; j++)
        {
            int component = (firstOffset + j) % dimensions;
            max[component] = Math.Max(max[component], temp[j]);
        }
    }

    // Scalar pass over the elements that did not fill a whole block
    ProcessRemainingElements(ptr, i, length, dimensions, min, max);
}
83104
84- private static unsafe void ProcessWithVector ( float * ptr , int length , int dimensions , float [ ] min , float [ ] max ) {
105+ private static unsafe void ProcessWithVector ( float * ptr , int length , int dimensions , float [ ] min , float [ ] max )
106+ {
85107 var minVecs = new Vector < float > [ dimensions ] ;
86108 var maxVecs = new Vector < float > [ dimensions ] ;
87109 int vectorSize = Vector < float > . Count ;
88110
89- for ( int d = 0 ; d < dimensions ; d ++ ) {
111+ for ( int d = 0 ; d < dimensions ; d ++ )
112+ {
90113 minVecs [ d ] = new Vector < float > ( float . MaxValue ) ;
91114 maxVecs [ d ] = new Vector < float > ( float . MinValue ) ;
92115 }
@@ -95,8 +118,10 @@ private static unsafe void ProcessWithVector(float* ptr, int length, int dimensi
95118 int vectorizedLength = length - ( length % ( dimensions * vectorSize ) ) ;
96119
97120 // Main vectorized loop
98- for ( ; i < vectorizedLength ; i += dimensions * vectorSize ) {
99- for ( int d = 0 ; d < dimensions ; d ++ ) {
121+ for ( ; i < vectorizedLength ; i += dimensions * vectorSize )
122+ {
123+ for ( int d = 0 ; d < dimensions ; d ++ )
124+ {
100125 var span = new ReadOnlySpan < float > ( ptr + i + d * vectorSize , vectorSize ) ;
101126 var vec = new Vector < float > ( span ) ;
102127 minVecs [ d ] = Vector . Min ( minVecs [ d ] , vec ) ;
@@ -105,11 +130,13 @@ private static unsafe void ProcessWithVector(float* ptr, int length, int dimensi
105130 }
106131
107132 // Reduce vectors to scalar values
108- for ( int d = 0 ; d < dimensions ; d ++ ) {
133+ for ( int d = 0 ; d < dimensions ; d ++ )
134+ {
109135 min [ d ] = float . MaxValue ;
110136 max [ d ] = float . MinValue ;
111137
112- for ( int j = 0 ; j < vectorSize ; j ++ ) {
138+ for ( int j = 0 ; j < vectorSize ; j ++ )
139+ {
113140 min [ d ] = Math . Min ( min [ d ] , minVecs [ d ] [ j ] ) ;
114141 max [ d ] = Math . Max ( max [ d ] , maxVecs [ d ] [ j ] ) ;
115142 }
@@ -118,18 +145,162 @@ private static unsafe void ProcessWithVector(float* ptr, int length, int dimensi
118145 ProcessRemainingElements ( ptr , i , length , dimensions , min , max ) ;
119146 }
120147
121- private static unsafe void ProcessScalar ( float * ptr , int length , int dimensions , float [ ] min , float [ ] max ) {
122- for ( int i = 0 ; i < length ; i += dimensions ) {
123- for ( int d = 0 ; d < dimensions ; d ++ ) {
// ReSharper disable once InconsistentNaming
/// <summary>
/// Three-component counterpart of the general dispatcher: pins <paramref name="data"/> and
/// picks the AVX, Vector&lt;T&gt; or scalar implementation depending on hardware support and
/// the number of floats available.
/// </summary>
/// <param name="data">Flat float buffer to scan.</param>
/// <param name="min">Per-component minima (3 entries), updated in place.</param>
/// <param name="max">Per-component maxima (3 entries), updated in place.</param>
private static unsafe void ProcessSIMD3D(ReadOnlySpan<float> data, float[] min, float[] max)
{
    int count = data.Length;

    fixed (float* ptr = data)
    {
        // 24 floats = one full AVX block (3 registers of 8 floats)
        if (Avx2.IsSupported && count >= 24)
        {
            ProcessWithAVX3D(ptr, count, min, max);
            return;
        }

        if (Vector.IsHardwareAccelerated && count >= 12)
        {
            ProcessWithVector3D(ptr, count, min, max);
            return;
        }

        ProcessScalar(ptr, count, 3, min, max);
    }
}
165+
166+
// ReSharper disable once InconsistentNaming
/// <summary>
/// Folds per-component minima and maxima of an interleaved three-component float buffer
/// into <paramref name="min"/> and <paramref name="max"/> using 256-bit AVX loads.
/// </summary>
/// <param name="ptr">Pointer to the first float of the buffer.</param>
/// <param name="length">Total number of floats readable from <paramref name="ptr"/>.</param>
/// <param name="min">Per-component minima (3 entries); existing values are kept when they are already smaller.</param>
/// <param name="max">Per-component maxima (3 entries); existing values are kept when they are already larger.</param>
private static unsafe void ProcessWithAVX3D(float* ptr, int length, float[] min, float[] max)
{
    const int LaneCount = 8;              // floats per 256-bit register
    const int BlockSize = 3 * LaneCount;  // 24 floats = 8 whole three-component elements

    // One accumulator per register position inside a block
    var min0 = Vector256.Create(float.MaxValue);
    var min1 = Vector256.Create(float.MaxValue);
    var min2 = Vector256.Create(float.MaxValue);

    var max0 = Vector256.Create(float.MinValue);
    var max1 = Vector256.Create(float.MinValue);
    var max2 = Vector256.Create(float.MinValue);

    int i = 0;
    int vectorizedLength = length - (length % BlockSize);

    // Main processing loop - handles one 24-float block per iteration
    for (; i < vectorizedLength; i += BlockSize)
    {
        var c0 = Avx.LoadVector256(ptr + i);
        min0 = Avx.Min(min0, c0);
        max0 = Avx.Max(max0, c0);

        var c1 = Avx.LoadVector256(ptr + i + LaneCount);
        min1 = Avx.Min(min1, c1);
        max1 = Avx.Max(max1, c1);

        var c2 = Avx.LoadVector256(ptr + i + 2 * LaneCount);
        min2 = Avx.Min(min2, c2);
        max2 = Avx.Max(max2, c2);
    }

    // Lay the three accumulators out back to back so they mirror one 24-float block.
    // Float k of a block belongs to component k % 3, so each lane is reduced into that
    // component. Reducing a whole register into a single component would mix the
    // components together, because every register holds lanes of all three.
    var lanes = stackalloc float[BlockSize];

    Avx.Store(lanes, min0);
    Avx.Store(lanes + LaneCount, min1);
    Avx.Store(lanes + 2 * LaneCount, min2);
    for (int k = 0; k < BlockSize; k++)
    {
        int component = k % 3;
        min[component] = Math.Min(min[component], lanes[k]);
    }

    Avx.Store(lanes, max0);
    Avx.Store(lanes + LaneCount, max1);
    Avx.Store(lanes + 2 * LaneCount, max2);
    for (int k = 0; k < BlockSize; k++)
    {
        int component = k % 3;
        max[component] = Math.Max(max[component], lanes[k]);
    }

    // Scalar pass over the elements that did not fill a whole block
    ProcessRemainingElements(ptr, i, length, 3, min, max);
}
230+
/// <summary>
/// Folds per-component minima and maxima of an interleaved three-component float buffer
/// into <paramref name="min"/> and <paramref name="max"/> using <see cref="Vector{T}"/>.
/// </summary>
/// <param name="ptr">Pointer to the first float of the buffer.</param>
/// <param name="length">Total number of floats readable from <paramref name="ptr"/>.</param>
/// <param name="min">Per-component minima (3 entries); existing values are kept when they are already smaller.</param>
/// <param name="max">Per-component maxima (3 entries); existing values are kept when they are already larger.</param>
private static unsafe void ProcessWithVector3D(float* ptr, int length, float[] min, float[] max)
{
    int vectorSize = Vector<float>.Count;

    // A block of 3 * vectorSize floats holds vectorSize whole elements, so every block
    // starts on an element boundary and the lane -> component mapping never shifts.
    int blockSize = 3 * vectorSize;

    // One accumulator per vector position inside a block
    var min0 = new Vector<float>(float.MaxValue);
    var min1 = new Vector<float>(float.MaxValue);
    var min2 = new Vector<float>(float.MaxValue);

    var max0 = new Vector<float>(float.MinValue);
    var max1 = new Vector<float>(float.MinValue);
    var max2 = new Vector<float>(float.MinValue);

    int i = 0;
    int vectorizedLength = length - (length % blockSize);

    // Main processing loop
    for (; i < vectorizedLength; i += blockSize)
    {
        var vec0 = new Vector<float>(new ReadOnlySpan<float>(ptr + i, vectorSize));
        min0 = Vector.Min(min0, vec0);
        max0 = Vector.Max(max0, vec0);

        var vec1 = new Vector<float>(new ReadOnlySpan<float>(ptr + i + vectorSize, vectorSize));
        min1 = Vector.Min(min1, vec1);
        max1 = Vector.Max(max1, vec1);

        var vec2 = new Vector<float>(new ReadOnlySpan<float>(ptr + i + 2 * vectorSize, vectorSize));
        min2 = Vector.Min(min2, vec2);
        max2 = Vector.Max(max2, vec2);
    }

    // Lane j of accumulator v only ever saw the floats at block offset v * vectorSize + j,
    // which belong to component (v * vectorSize + j) % 3. Each lane is reduced into that
    // component; reducing a whole accumulator into one component would mix the components.
    for (int j = 0; j < vectorSize; j++)
    {
        int c0 = j % 3;
        int c1 = (vectorSize + j) % 3;
        int c2 = (2 * vectorSize + j) % 3;

        min[c0] = Math.Min(min[c0], min0[j]);
        min[c1] = Math.Min(min[c1], min1[j]);
        min[c2] = Math.Min(min[c2], min2[j]);

        max[c0] = Math.Max(max[c0], max0[j]);
        max[c1] = Math.Max(max[c1], max1[j]);
        max[c2] = Math.Max(max[c2], max2[j]);
    }

    // Scalar pass over the elements that did not fill a whole block
    ProcessRemainingElements(ptr, i, length, 3, min, max);
}
285+
/// <summary>
/// Plain scalar fallback: walks the buffer one element at a time and folds every
/// component into the matching entry of <paramref name="min"/> and <paramref name="max"/>.
/// </summary>
/// <param name="ptr">Pointer to the first float of the buffer.</param>
/// <param name="length">Total number of floats readable from <paramref name="ptr"/>.</param>
/// <param name="dimensions">Number of components per element.</param>
/// <param name="min">Per-component minima, updated in place.</param>
/// <param name="max">Per-component maxima, updated in place.</param>
private static unsafe void ProcessScalar(float* ptr, int length, int dimensions, float[] min, float[] max)
{
    int offset = 0;
    while (offset < length)
    {
        // Start of the current element
        float* element = ptr + offset;

        for (int component = 0; component < dimensions; component++)
        {
            float value = element[component];
            min[component] = Math.Min(min[component], value);
            max[component] = Math.Max(max[component], value);
        }

        offset += dimensions;
    }
}
129297
130- private static unsafe void ProcessRemainingElements ( float * ptr , int start , int length , int dimensions , float [ ] min , float [ ] max ) {
131- for ( int i = start ; i < length ; i += dimensions ) {
132- for ( int d = 0 ; d < dimensions ; d ++ ) {
298+ private static unsafe void ProcessRemainingElements ( float * ptr , int start , int length , int dimensions , float [ ] min , float [ ] max )
299+ {
300+ for ( int i = start ; i < length ; i += dimensions )
301+ {
302+ for ( int d = 0 ; d < dimensions ; d ++ )
303+ {
133304 min [ d ] = Math . Min ( min [ d ] , ptr [ i + d ] ) ;
134305 max [ d ] = Math . Max ( max [ d ] , ptr [ i + d ] ) ;
135306 }
0 commit comments