Skip to content

Commit 8e721da

Browse files
Merge pull request #9 from mbjarland/vectorized-minmax-fix
Vectorized minmax fix
2 parents aa50e05 + d6b9256 commit 8e721da

File tree

4 files changed

+355
-13
lines changed

4 files changed

+355
-13
lines changed

.gitignore

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
##
44
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
55

6+
# Apple
7+
.DS_Store
8+
69
# User-specific files
710
*.suo
811
*.user
@@ -221,7 +224,7 @@ ClientBin/
221224
*.publishsettings
222225
orleans.codegen.cs
223226

224-
# Including strong name files can present a security risk
227+
# Including strong name files can present a security risk
225228
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
226229
#*.snk
227230

@@ -317,7 +320,7 @@ __pycache__/
317320
# OpenCover UI analysis results
318321
OpenCover/
319322

320-
# Azure Stream Analytics local run output
323+
# Azure Stream Analytics local run output
321324
ASALocalRun/
322325

323326
# MSBuild Binary and Structured Log
@@ -326,6 +329,6 @@ ASALocalRun/
326329
# NVidia Nsight GPU debugger configuration file
327330
*.nvuser
328331

329-
# MFractors (Xamarin productivity tool) working folder
332+
# MFractors (Xamarin productivity tool) working folder
330333
.mfractor/
331334
/tests/TestFiles

src/.DS_Store

6 KB
Binary file not shown.
Lines changed: 312 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,312 @@
1+
#if NET6_0_OR_GREATER
2+
using System;
3+
using System.Numerics;
4+
using System.Runtime.Intrinsics;
5+
using System.Runtime.Intrinsics.X86;
6+
7+
namespace SharpGLTF.Schema2
8+
{
9+
/// <summary>
/// SIMD-accelerated search for the per-component minimum and maximum of a buffer of interleaved
/// float vectors (for three components: x0, y0, z0, x1, y1, z1, ...).
/// A multithreaded variant was also evaluated, but it was not faster than this implementation
/// for the data sets it was tested against. If anybody feels so inclined, please feel free to
/// try and improve this further.
/// </summary>
public static class VectorMinMax
{
    // Number of floats held by one 256-bit AVX register.
    private const int Avx256LaneCount = 8;

    /// <summary>
    /// Finds the smallest and largest value of every component in a buffer of interleaved vectors.
    /// </summary>
    /// <param name="data">Interleaved components; element <c>k</c> belongs to component <c>k % dimensions</c>.</param>
    /// <param name="dimensions">Number of components per vector. Must be greater than zero.</param>
    /// <returns>
    /// Two arrays of length <paramref name="dimensions"/> holding the per-component minimum and maximum.
    /// For empty <paramref name="data"/> the minimums stay at <see cref="float.MaxValue"/> and the
    /// maximums at <see cref="float.MinValue"/>.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="dimensions"/> is zero or negative.</exception>
    /// <exception cref="ArgumentException">The length of <paramref name="data"/> is not a multiple of <paramref name="dimensions"/>.</exception>
    public static (float[] min, float[] max) FindMinMax(ReadOnlySpan<float> data, int dimensions)
    {
        // Without this guard a zero would surface as DivideByZeroException and a negative
        // value as OverflowException from the array allocation below.
        if (dimensions <= 0)
            throw new ArgumentOutOfRangeException(nameof(dimensions), dimensions, "Dimensions must be greater than zero");

        if (data.Length % dimensions != 0)
            throw new ArgumentException($"Data length must be divisible by {dimensions}");

        var min = new float[dimensions];
        var max = new float[dimensions];
        Array.Fill(min, float.MaxValue);
        Array.Fill(max, float.MinValue);

        if (dimensions == 3 && data.Length >= 24)
        {
            // Special optimized path for 3D vectors
            ProcessSIMD3D(data, min, max);
        }
        else
        {
            // General case for other dimensions
            ProcessSIMD(data, dimensions, min, max);
        }

        return (min, max);
    }

    /// <summary>
    /// Picks the widest implementation available for an arbitrary component count:
    /// 256-bit AVX, then <see cref="Vector{T}"/>, then plain scalar code.
    /// </summary>
    // ReSharper disable once InconsistentNaming
    private static unsafe void ProcessSIMD(ReadOnlySpan<float> data, int dimensions, float[] min, float[] max)
    {
        fixed (float* ptr = data)
        {
            // Only AVX instructions are used below, so AVX (not AVX2) is the capability to test.
            if (Avx.IsSupported && data.Length >= dimensions * Avx256LaneCount)
            {
                // intel processors, 8 floats = 256 bits
                ProcessWithAVX(ptr, data.Length, dimensions, min, max);
            }
            else if (Vector.IsHardwareAccelerated && data.Length >= dimensions * Vector<float>.Count)
            {
                // on arm / apple silicon etc, Vector<float>.Count usually == 4. 4 floats = 128 bits
                ProcessWithVector(ptr, data.Length, dimensions, min, max);
            }
            else
            {
                // and otherwise fall back to for loops and scalar operations, comparing one float at a time
                ProcessScalar(ptr, data.Length, dimensions, min, max);
            }
        }
    }

    /// <summary>
    /// AVX implementation for any component count. Works on chunks of <c>dimensions * 8</c> floats
    /// so that every lane of every accumulator always sees the same component.
    /// </summary>
    // ReSharper disable once InconsistentNaming
    private static unsafe void ProcessWithAVX(float* ptr, int length, int dimensions, float[] min, float[] max)
    {
        var minVecs = new Vector256<float>[dimensions];
        var maxVecs = new Vector256<float>[dimensions];
        Array.Fill(minVecs, Vector256.Create(float.MaxValue));
        Array.Fill(maxVecs, Vector256.Create(float.MinValue));

        int chunkLength = dimensions * Avx256LaneCount;
        int vectorizedLength = length - (length % chunkLength);

        int i = 0;
        for (; i < vectorizedLength; i += chunkLength)
        {
            for (int v = 0; v < dimensions; v++)
            {
                var vec = Avx.LoadVector256(ptr + i + v * Avx256LaneCount);
                minVecs[v] = Avx.Min(minVecs[v], vec);
                maxVecs[v] = Avx.Max(maxVecs[v], vec);
            }
        }

        // Accumulator v covers chunk offsets [v * 8, v * 8 + 7]; its lanes hold different
        // components, so each lane is folded into the component it actually belongs to.
        for (int v = 0; v < dimensions; v++)
        {
            FoldLanes(minVecs[v], maxVecs[v], v * Avx256LaneCount, dimensions, min, max);
        }

        ProcessRemainingElements(ptr, i, length, dimensions, min, max);
    }

    /// <summary>
    /// <see cref="Vector{T}"/> implementation for any component count. Works on chunks of
    /// <c>dimensions * Vector&lt;float&gt;.Count</c> floats.
    /// </summary>
    private static unsafe void ProcessWithVector(float* ptr, int length, int dimensions, float[] min, float[] max)
    {
        int vectorSize = Vector<float>.Count;

        var minVecs = new Vector<float>[dimensions];
        var maxVecs = new Vector<float>[dimensions];
        Array.Fill(minVecs, new Vector<float>(float.MaxValue));
        Array.Fill(maxVecs, new Vector<float>(float.MinValue));

        int chunkLength = dimensions * vectorSize;
        int vectorizedLength = length - (length % chunkLength);

        // Main vectorized loop
        int i = 0;
        for (; i < vectorizedLength; i += chunkLength)
        {
            for (int v = 0; v < dimensions; v++)
            {
                var vec = new Vector<float>(new ReadOnlySpan<float>(ptr + i + v * vectorSize, vectorSize));
                minVecs[v] = Vector.Min(minVecs[v], vec);
                maxVecs[v] = Vector.Max(maxVecs[v], vec);
            }
        }

        // Reduce vectors to scalar values, routing every lane to its own component.
        for (int v = 0; v < dimensions; v++)
        {
            FoldLanes(minVecs[v], maxVecs[v], v * vectorSize, dimensions, min, max);
        }

        ProcessRemainingElements(ptr, i, length, dimensions, min, max);
    }

    /// <summary>
    /// Picks the widest implementation available for three-component vectors.
    /// </summary>
    // ReSharper disable once InconsistentNaming
    private static unsafe void ProcessSIMD3D(ReadOnlySpan<float> data, float[] min, float[] max)
    {
        fixed (float* ptr = data)
        {
            // Only AVX instructions are used below, so AVX (not AVX2) is the capability to test.
            if (Avx.IsSupported && data.Length >= 24)
            {
                ProcessWithAVX3D(ptr, data.Length, min, max);
            }
            else if (Vector.IsHardwareAccelerated && data.Length >= 12)
            {
                ProcessWithVector3D(ptr, data.Length, min, max);
            }
            else
            {
                ProcessScalar(ptr, data.Length, 3, min, max);
            }
        }
    }

    /// <summary>
    /// AVX implementation specialised for three components; keeps the six accumulators in locals
    /// instead of arrays. Works on chunks of 24 floats (8 vectors of 3 components).
    /// </summary>
    // ReSharper disable once InconsistentNaming
    private static unsafe void ProcessWithAVX3D(float* ptr, int length, float[] min, float[] max)
    {
        // One min/max accumulator per 8-float slice of the 24-float chunk
        var min0 = Vector256.Create(float.MaxValue);
        var min1 = min0;
        var min2 = min0;

        var max0 = Vector256.Create(float.MinValue);
        var max1 = max0;
        var max2 = max0;

        int vectorizedLength = length - (length % 24);

        // Main processing loop - handles 8 vectors at a time
        int i = 0;
        for (; i < vectorizedLength; i += 24)
        {
            var c0 = Avx.LoadVector256(ptr + i);
            min0 = Avx.Min(min0, c0);
            max0 = Avx.Max(max0, c0);

            var c1 = Avx.LoadVector256(ptr + i + 8);
            min1 = Avx.Min(min1, c1);
            max1 = Avx.Max(max1, c1);

            var c2 = Avx.LoadVector256(ptr + i + 16);
            min2 = Avx.Min(min2, c2);
            max2 = Avx.Max(max2, c2);
        }

        // Reduce the vectors to scalar values. The slices start at chunk offsets 0, 8 and 16,
        // so their lanes cycle through the components starting at x, z and y respectively.
        FoldLanes(min0, max0, 0, 3, min, max);
        FoldLanes(min1, max1, 8, 3, min, max);
        FoldLanes(min2, max2, 16, 3, min, max);

        // Process remaining elements
        ProcessRemainingElements(ptr, i, length, 3, min, max);
    }

    /// <summary>
    /// <see cref="Vector{T}"/> implementation specialised for three components. Works on chunks of
    /// <c>3 * Vector&lt;float&gt;.Count</c> floats.
    /// </summary>
    private static unsafe void ProcessWithVector3D(float* ptr, int length, float[] min, float[] max)
    {
        int vectorSize = Vector<float>.Count;

        // One min/max accumulator per vector-sized slice of the chunk
        var min0 = new Vector<float>(float.MaxValue);
        var min1 = min0;
        var min2 = min0;

        var max0 = new Vector<float>(float.MinValue);
        var max1 = max0;
        var max2 = max0;

        int chunkLength = 3 * vectorSize;
        int vectorizedLength = length - (length % chunkLength);

        // Main processing loop
        int i = 0;
        for (; i < vectorizedLength; i += chunkLength)
        {
            var vec0 = new Vector<float>(new ReadOnlySpan<float>(ptr + i, vectorSize));
            min0 = Vector.Min(min0, vec0);
            max0 = Vector.Max(max0, vec0);

            var vec1 = new Vector<float>(new ReadOnlySpan<float>(ptr + i + vectorSize, vectorSize));
            min1 = Vector.Min(min1, vec1);
            max1 = Vector.Max(max1, vec1);

            var vec2 = new Vector<float>(new ReadOnlySpan<float>(ptr + i + 2 * vectorSize, vectorSize));
            min2 = Vector.Min(min2, vec2);
            max2 = Vector.Max(max2, vec2);
        }

        // Reduce vectors to scalar values, routing every lane to its own component.
        FoldLanes(min0, max0, 0, 3, min, max);
        FoldLanes(min1, max1, vectorSize, 3, min, max);
        FoldLanes(min2, max2, 2 * vectorSize, 3, min, max);

        // Process remaining elements
        ProcessRemainingElements(ptr, i, length, 3, min, max);
    }

    /// <summary>
    /// Folds the lanes of a pair of 256-bit accumulators into the per-component results.
    /// </summary>
    /// <param name="minVec">Lane-wise minimums gathered by the main loop.</param>
    /// <param name="maxVec">Lane-wise maximums gathered by the main loop.</param>
    /// <param name="firstLaneOffset">Offset of lane 0 from the start of a chunk, in floats.</param>
    /// <param name="dimensions">Number of components per vector.</param>
    /// <param name="min">Per-component minimums, updated in place.</param>
    /// <param name="max">Per-component maximums, updated in place.</param>
    private static unsafe void FoldLanes(Vector256<float> minVec, Vector256<float> maxVec, int firstLaneOffset, int dimensions, float[] min, float[] max)
    {
        var minLanes = stackalloc float[Avx256LaneCount];
        var maxLanes = stackalloc float[Avx256LaneCount];
        Avx.Store(minLanes, minVec);
        Avx.Store(maxLanes, maxVec);

        for (int lane = 0; lane < Avx256LaneCount; lane++)
        {
            // Chunks start on a vector boundary, so the chunk offset alone decides the component.
            int component = (firstLaneOffset + lane) % dimensions;
            min[component] = Math.Min(min[component], minLanes[lane]);
            max[component] = Math.Max(max[component], maxLanes[lane]);
        }
    }

    /// <summary>
    /// Folds the lanes of a pair of <see cref="Vector{T}"/> accumulators into the per-component results.
    /// </summary>
    /// <param name="minVec">Lane-wise minimums gathered by the main loop.</param>
    /// <param name="maxVec">Lane-wise maximums gathered by the main loop.</param>
    /// <param name="firstLaneOffset">Offset of lane 0 from the start of a chunk, in floats.</param>
    /// <param name="dimensions">Number of components per vector.</param>
    /// <param name="min">Per-component minimums, updated in place.</param>
    /// <param name="max">Per-component maximums, updated in place.</param>
    private static void FoldLanes(Vector<float> minVec, Vector<float> maxVec, int firstLaneOffset, int dimensions, float[] min, float[] max)
    {
        for (int lane = 0; lane < Vector<float>.Count; lane++)
        {
            // Chunks start on a vector boundary, so the chunk offset alone decides the component.
            int component = (firstLaneOffset + lane) % dimensions;
            min[component] = Math.Min(min[component], minVec[lane]);
            max[component] = Math.Max(max[component], maxVec[lane]);
        }
    }

    /// <summary>
    /// Scalar fallback: walks the whole buffer one float at a time.
    /// </summary>
    private static unsafe void ProcessScalar(float* ptr, int length, int dimensions, float[] min, float[] max)
    {
        ProcessRemainingElements(ptr, 0, length, dimensions, min, max);
    }

    /// <summary>
    /// Scalar loop over the vectors in <c>[start, length)</c>; used for the tail that does not fill
    /// a whole SIMD chunk. <paramref name="start"/> must be a multiple of <paramref name="dimensions"/>.
    /// </summary>
    private static unsafe void ProcessRemainingElements(float* ptr, int start, int length, int dimensions, float[] min, float[] max)
    {
        for (int i = start; i < length; i += dimensions)
        {
            for (int d = 0; d < dimensions; d++)
            {
                float value = ptr[i + d];
                min[d] = Math.Min(min[d], value);
                max[d] = Math.Max(max[d], value);
            }
        }
    }
}
311+
}
312+
#endif

0 commit comments

Comments
 (0)