Skip to content

Commit 1020115

Browse files
author
Matias Bjarland
committed
specialization for 3 dimensions - cuts down UpdateBounds time further
1 parent 749baef commit 1020115

File tree

5 files changed

+203
-28
lines changed

5 files changed

+203
-28
lines changed

src/.DS_Store

6 KB
Binary file not shown.

src/SharpGLTF.Core/Schema2/VectorMinMax.cs

Lines changed: 199 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@ namespace SharpGLTF.Schema2
77
{
88
/// <summary>
99
/// Somewhat optimized version of finding min/max values in a vector of floats. Please note some effort
10-
/// has been made to test a multi threaded version of this as well but it was not faster than this implementation
10+
/// has been made to test a multithreaded version of this as well, but it was not faster than this implementation
1111
/// for the data sets it was tested against. If anybody feels so inclined, please feel free to try and improve
1212
/// this further.
1313
/// </summary>
1414
public static class VectorMinMax
1515
{
16-
public static (float[] min, float[] max) FindMinMax(ReadOnlySpan<float> data, int dimensions) {
16+
public static (float[] min, float[] max) FindMinMax(ReadOnlySpan<float> data, int dimensions)
17+
{
1718
if (data.Length % dimensions != 0)
1819
throw new ArgumentException($"Data length must be divisible by {dimensions}");
1920

@@ -22,71 +23,93 @@ public static (float[] min, float[] max) FindMinMax(ReadOnlySpan<float> data, in
2223
Array.Fill(min, float.MaxValue);
2324
Array.Fill(max, float.MinValue);
2425

25-
// Just use SIMD without parallelization for each individual call
26-
ProcessSIMD(data, dimensions, min, max);
26+
27+
if (dimensions == 3 && data.Length >= 24)
28+
{
29+
// Special optimized path for 3D vectors
30+
ProcessSIMD3D(data, min, max);
31+
} else
32+
{
33+
// General case for other dimensions
34+
ProcessSIMD(data, dimensions, min, max);
35+
}
2736

2837
return (min, max);
2938
}
3039

3140
// ReSharper disable once InconsistentNaming
32-
private static unsafe void ProcessSIMD(ReadOnlySpan<float> data, int dimensions, float[] min, float[] max)
{
    // Pins the span and hands the raw pointer to the widest implementation that is
    // both supported by the hardware and has at least one full chunk of data to work on.
    // Results are accumulated into the caller-supplied min / max arrays.
    int count = data.Length;

    fixed (float* ptr = data)
    {
        // 256-bit path: one chunk is 8 floats per dimension.
        if (Avx2.IsSupported && count >= dimensions * 8)
        {
            ProcessWithAVX(ptr, count, dimensions, min, max);
            return;
        }

        // Portable Vector<T> path (e.g. arm / apple silicon, where Vector<float>.Count is usually 4).
        if (Vector.IsHardwareAccelerated && count >= dimensions * Vector<float>.Count)
        {
            ProcessWithVector(ptr, count, dimensions, min, max);
            return;
        }

        // No usable SIMD, or too little data: compare one float at a time.
        ProcessScalar(ptr, count, dimensions, min, max);
    }
}
4660

4761
// ReSharper disable once InconsistentNaming
48-
private static unsafe void ProcessWithAVX(float* ptr, int length, int dimensions, float[] min, float[] max)
{
    // Folds the per-component minimum / maximum of `length` interleaved floats
    // (c0 c1 .. c[dimensions-1] c0 c1 ..) into min[] / max[] using 256-bit AVX operations.
    //
    // ptr        - start of the interleaved data
    // length     - number of floats readable from ptr
    // dimensions - number of components per element
    // min / max  - running results; existing contents are folded into, not overwritten
    //
    // The data is consumed in chunks of dimensions * 8 floats, loaded as `dimensions`
    // consecutive 8-lane vectors. Because the data is interleaved, lane j of vector d holds
    // component (d * 8 + j) % dimensions - NOT component d. The accumulation is lane-wise,
    // so that mapping stays stable across chunks (every chunk starts on a multiple of
    // `dimensions`) and only has to be undone once, in the reduction step below.
    const int LaneCount = 8;
    int chunkSize = dimensions * LaneCount;

    var minVecs = new Vector256<float>[dimensions];
    var maxVecs = new Vector256<float>[dimensions];

    for (int d = 0; d < dimensions; d++)
    {
        minVecs[d] = Vector256.Create(float.MaxValue);
        maxVecs[d] = Vector256.Create(float.MinValue);
    }

    int i = 0;
    int vectorizedLength = length - (length % chunkSize);

    for (; i < vectorizedLength; i += chunkSize)
    {
        for (int d = 0; d < dimensions; d++)
        {
            var vec = Avx.LoadVector256(ptr + i + d * LaneCount);
            minVecs[d] = Avx.Min(minVecs[d], vec);
            maxVecs[d] = Avx.Max(maxVecs[d], vec);
        }
    }

    // Reduce: route every lane to the component it actually accumulated.
    var temp = stackalloc float[LaneCount];
    for (int d = 0; d < dimensions; d++)
    {
        Avx.Store(temp, minVecs[d]);
        for (int j = 0; j < LaneCount; j++)
        {
            int component = (d * LaneCount + j) % dimensions;
            min[component] = Math.Min(min[component], temp[j]);
        }

        Avx.Store(temp, maxVecs[d]);
        for (int j = 0; j < LaneCount; j++)
        {
            int component = (d * LaneCount + j) % dimensions;
            max[component] = Math.Max(max[component], temp[j]);
        }
    }

    // Whatever did not fill a whole chunk is handled element by element.
    ProcessRemainingElements(ptr, i, length, dimensions, min, max);
}
83104

84-
private static unsafe void ProcessWithVector(float* ptr, int length, int dimensions, float[] min, float[] max) {
105+
private static unsafe void ProcessWithVector(float* ptr, int length, int dimensions, float[] min, float[] max)
106+
{
85107
var minVecs = new Vector<float>[dimensions];
86108
var maxVecs = new Vector<float>[dimensions];
87109
int vectorSize = Vector<float>.Count;
88110

89-
for (int d = 0; d < dimensions; d++) {
111+
for (int d = 0; d < dimensions; d++)
112+
{
90113
minVecs[d] = new Vector<float>(float.MaxValue);
91114
maxVecs[d] = new Vector<float>(float.MinValue);
92115
}
@@ -95,8 +118,10 @@ private static unsafe void ProcessWithVector(float* ptr, int length, int dimensi
95118
int vectorizedLength = length - (length % (dimensions * vectorSize));
96119

97120
// Main vectorized loop
98-
for (; i < vectorizedLength; i += dimensions * vectorSize) {
99-
for (int d = 0; d < dimensions; d++) {
121+
for (; i < vectorizedLength; i += dimensions * vectorSize)
122+
{
123+
for (int d = 0; d < dimensions; d++)
124+
{
100125
var span = new ReadOnlySpan<float>(ptr + i + d * vectorSize, vectorSize);
101126
var vec = new Vector<float>(span);
102127
minVecs[d] = Vector.Min(minVecs[d], vec);
@@ -105,11 +130,13 @@ private static unsafe void ProcessWithVector(float* ptr, int length, int dimensi
105130
}
106131

107132
// Reduce vectors to scalar values
108-
for (int d = 0; d < dimensions; d++) {
133+
for (int d = 0; d < dimensions; d++)
134+
{
109135
min[d] = float.MaxValue;
110136
max[d] = float.MinValue;
111137

112-
for (int j = 0; j < vectorSize; j++) {
138+
for (int j = 0; j < vectorSize; j++)
139+
{
113140
min[d] = Math.Min(min[d], minVecs[d][j]);
114141
max[d] = Math.Max(max[d], maxVecs[d][j]);
115142
}
@@ -118,18 +145,162 @@ private static unsafe void ProcessWithVector(float* ptr, int length, int dimensi
118145
ProcessRemainingElements(ptr, i, length, dimensions, min, max);
119146
}
120147

121-
private static unsafe void ProcessScalar(float* ptr, int length, int dimensions, float[] min, float[] max) {
122-
for (int i = 0; i < length; i += dimensions) {
123-
for (int d = 0; d < dimensions; d++) {
148+
// ReSharper disable once InconsistentNaming
149+
private static unsafe void ProcessSIMD3D(ReadOnlySpan<float> data, float[] min, float[] max)
{
    // Three-component counterpart of the general dispatcher: pins the span and picks
    // the AVX, Vector<T> or scalar implementation depending on hardware support and
    // on how many floats are available.
    int count = data.Length;

    fixed (float* ptr = data)
    {
        // 24 floats = three 8-lane vectors.
        if (Avx2.IsSupported && count >= 24)
        {
            ProcessWithAVX3D(ptr, count, min, max);
            return;
        }

        if (Vector.IsHardwareAccelerated && count >= 12)
        {
            ProcessWithVector3D(ptr, count, min, max);
            return;
        }

        ProcessScalar(ptr, count, 3, min, max);
    }
}
165+
166+
167+
// ReSharper disable once InconsistentNaming
168+
private static unsafe void ProcessWithAVX3D(float* ptr, int length, float[] min, float[] max)
{
    // Computes the per-component minimum / maximum of interleaved 3-component data
    // (x y z x y z ..) using 256-bit AVX operations, writing the result to min[0..2] / max[0..2].
    //
    // ptr       - start of the interleaved data
    // length    - number of floats readable from ptr
    // min / max - receive the results; slots 0..2 are overwritten
    //
    // Each iteration consumes 24 floats (8 xyz triples) as three consecutive 8-lane loads.
    // Because the data is interleaved, the lanes do NOT line up with components:
    //   acc0: x y z x y z x y
    //   acc1: z x y z x y z x
    //   acc2: y z x y z x y z
    // i.e. flat lane index n (0..23) holds component n % 3. The lane-wise accumulation keeps
    // that layout stable across iterations, so it is untangled once in the reduction below.
    const int LaneCount = 8;
    const int ChunkSize = 3 * LaneCount;

    var minAcc0 = Vector256.Create(float.MaxValue);
    var minAcc1 = Vector256.Create(float.MaxValue);
    var minAcc2 = Vector256.Create(float.MaxValue);

    var maxAcc0 = Vector256.Create(float.MinValue);
    var maxAcc1 = Vector256.Create(float.MinValue);
    var maxAcc2 = Vector256.Create(float.MinValue);

    int i = 0;
    int vectorizedLength = length - (length % ChunkSize);

    for (; i < vectorizedLength; i += ChunkSize)
    {
        var c0 = Avx.LoadVector256(ptr + i);
        minAcc0 = Avx.Min(minAcc0, c0);
        maxAcc0 = Avx.Max(maxAcc0, c0);

        var c1 = Avx.LoadVector256(ptr + i + LaneCount);
        minAcc1 = Avx.Min(minAcc1, c1);
        maxAcc1 = Avx.Max(maxAcc1, c1);

        var c2 = Avx.LoadVector256(ptr + i + 2 * LaneCount);
        minAcc2 = Avx.Min(minAcc2, c2);
        maxAcc2 = Avx.Max(maxAcc2, c2);
    }

    // Spill all three accumulators back-to-back so that temp[n] is flat lane n.
    var temp = stackalloc float[ChunkSize];

    // Minimums: start from a clean slate, then route each lane to component n % 3.
    Avx.Store(temp, minAcc0);
    Avx.Store(temp + LaneCount, minAcc1);
    Avx.Store(temp + 2 * LaneCount, minAcc2);

    min[0] = float.MaxValue;
    min[1] = float.MaxValue;
    min[2] = float.MaxValue;
    for (int n = 0; n < ChunkSize; n++)
    {
        int component = n % 3;
        min[component] = Math.Min(min[component], temp[n]);
    }

    // Maximums: same routing.
    Avx.Store(temp, maxAcc0);
    Avx.Store(temp + LaneCount, maxAcc1);
    Avx.Store(temp + 2 * LaneCount, maxAcc2);

    max[0] = float.MinValue;
    max[1] = float.MinValue;
    max[2] = float.MinValue;
    for (int n = 0; n < ChunkSize; n++)
    {
        int component = n % 3;
        max[component] = Math.Max(max[component], temp[n]);
    }

    // Floats that did not fill a whole 24-float chunk are handled element by element.
    ProcessRemainingElements(ptr, i, length, 3, min, max);
}
230+
231+
/// <summary>
/// Computes the per-component minimum / maximum of interleaved 3-component data
/// (x y z x y z ..) using <see cref="Vector{T}"/>, writing the result to
/// <paramref name="min"/>[0..2] and <paramref name="max"/>[0..2].
/// </summary>
/// <remarks>
/// Each iteration loads three consecutive vectors. Because the data is interleaved, lane
/// <c>j</c> of the k-th vector holds component <c>(k * Vector&lt;float&gt;.Count + j) % 3</c>,
/// not component <c>k</c>. The lane-wise accumulation keeps that layout stable across
/// iterations (every chunk starts on a multiple of 3), so it is untangled once during the
/// final reduction.
/// </remarks>
/// <param name="ptr">Start of the interleaved data.</param>
/// <param name="length">Number of floats readable from <paramref name="ptr"/>.</param>
/// <param name="min">Receives the minimums; slots 0..2 are overwritten.</param>
/// <param name="max">Receives the maximums; slots 0..2 are overwritten.</param>
private static unsafe void ProcessWithVector3D(float* ptr, int length, float[] min, float[] max)
{
    int vectorSize = Vector<float>.Count;
    int chunkSize = 3 * vectorSize;

    var minAcc0 = new Vector<float>(float.MaxValue);
    var minAcc1 = new Vector<float>(float.MaxValue);
    var minAcc2 = new Vector<float>(float.MaxValue);

    var maxAcc0 = new Vector<float>(float.MinValue);
    var maxAcc1 = new Vector<float>(float.MinValue);
    var maxAcc2 = new Vector<float>(float.MinValue);

    int i = 0;
    int vectorizedLength = length - (length % chunkSize);

    for (; i < vectorizedLength; i += chunkSize)
    {
        var vec0 = new Vector<float>(new ReadOnlySpan<float>(ptr + i, vectorSize));
        minAcc0 = Vector.Min(minAcc0, vec0);
        maxAcc0 = Vector.Max(maxAcc0, vec0);

        var vec1 = new Vector<float>(new ReadOnlySpan<float>(ptr + i + vectorSize, vectorSize));
        minAcc1 = Vector.Min(minAcc1, vec1);
        maxAcc1 = Vector.Max(maxAcc1, vec1);

        var vec2 = new Vector<float>(new ReadOnlySpan<float>(ptr + i + 2 * vectorSize, vectorSize));
        minAcc2 = Vector.Min(minAcc2, vec2);
        maxAcc2 = Vector.Max(maxAcc2, vec2);
    }

    // Start the scalar results from a clean slate before folding the lanes in.
    min[0] = float.MaxValue;
    min[1] = float.MaxValue;
    min[2] = float.MaxValue;
    max[0] = float.MinValue;
    max[1] = float.MinValue;
    max[2] = float.MinValue;

    for (int j = 0; j < vectorSize; j++)
    {
        // Component actually stored in lane j of the first, second and third accumulator.
        int comp0 = j % 3;
        int comp1 = (vectorSize + j) % 3;
        int comp2 = (2 * vectorSize + j) % 3;

        min[comp0] = Math.Min(min[comp0], minAcc0[j]);
        min[comp1] = Math.Min(min[comp1], minAcc1[j]);
        min[comp2] = Math.Min(min[comp2], minAcc2[j]);

        max[comp0] = Math.Max(max[comp0], maxAcc0[j]);
        max[comp1] = Math.Max(max[comp1], maxAcc1[j]);
        max[comp2] = Math.Max(max[comp2], maxAcc2[j]);
    }

    // Floats that did not fill a whole chunk are handled element by element.
    ProcessRemainingElements(ptr, i, length, 3, min, max);
}
285+
286+
/// <summary>
/// Scalar fallback: walks the interleaved data one element (<paramref name="dimensions"/>
/// floats) at a time and folds every component into the running
/// <paramref name="min"/> / <paramref name="max"/> values.
/// </summary>
/// <param name="ptr">Start of the interleaved data.</param>
/// <param name="length">Number of floats to process.</param>
/// <param name="dimensions">Number of components per element.</param>
/// <param name="min">Running per-component minimums, updated in place.</param>
/// <param name="max">Running per-component maximums, updated in place.</param>
private static unsafe void ProcessScalar(float* ptr, int length, int dimensions, float[] min, float[] max)
{
    int offset = 0;
    while (offset < length)
    {
        float* element = ptr + offset;

        for (int component = 0; component < dimensions; component++)
        {
            float value = element[component];
            min[component] = Math.Min(min[component], value);
            max[component] = Math.Max(max[component], value);
        }

        offset += dimensions;
    }
}
129297

130-
private static unsafe void ProcessRemainingElements(float* ptr, int start, int length, int dimensions, float[] min, float[] max) {
131-
for (int i = start; i < length; i += dimensions) {
132-
for (int d = 0; d < dimensions; d++) {
298+
private static unsafe void ProcessRemainingElements(float* ptr, int start, int length, int dimensions, float[] min, float[] max)
299+
{
300+
for (int i = start; i < length; i += dimensions)
301+
{
302+
for (int d = 0; d < dimensions; d++)
303+
{
133304
min[d] = Math.Min(min[d], ptr[i + d]);
134305
max[d] = Math.Max(max[d], ptr[i + d]);
135306
}

src/SharpGLTF.Core/SharpGLTF.Core.csproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,8 @@
2828
</None>
2929
</ItemGroup>
3030

31+
<ItemGroup>
32+
<PackageReference Include="System.Numerics.Tensors" Version="8.0.0" />
33+
</ItemGroup>
34+
3135
</Project>

src/SharpGLTF.Toolkit/.DS_Store

6 KB
Binary file not shown.
6 KB
Binary file not shown.

0 commit comments

Comments
 (0)