Skip to content

Commit cd79226

Browse files
committed
Test for OpenCL Comb-Sort
1 parent 53a7107 commit cd79226

File tree

3 files changed

+144
-27
lines changed

3 files changed

+144
-27
lines changed

src/pyFAI/opencl/test/test_collective.py

Lines changed: 92 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
__contact__ = "jerome.kieffer@esrf.eu"
3434
__license__ = "MIT"
3535
__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France"
36-
__date__ = "07/11/2024"
36+
__date__ = "12/11/2024"
3737

3838
import logging
3939
import numpy
@@ -51,11 +51,11 @@
5151

5252
@unittest.skipIf(UtilsTest.opencl is False, "User request to skip OpenCL tests")
5353
@unittest.skipUnless(ocl, "PyOpenCl is missing")
54-
class TestReduction(unittest.TestCase):
54+
class TestGroupFunction(unittest.TestCase):
5555

5656
@classmethod
5757
def setUpClass(cls):
58-
super(TestReduction, cls).setUpClass()
58+
super(TestGroupFunction, cls).setUpClass()
5959

6060
if ocl:
6161
cls.ctx = ocl.create_context()
@@ -74,8 +74,8 @@ def setUpClass(cls):
7474

7575
@classmethod
7676
def tearDownClass(cls):
77-
super(TestReduction, cls).tearDownClass()
78-
print("Maximum valid workgroup size %s on device %s" % (cls.max_valid_wg, cls.ctx.devices[0]))
77+
super(TestGroupFunction, cls).tearDownClass()
78+
# print("Maximum valid workgroup size %s on device %s" % (cls.max_valid_wg, cls.ctx.devices[0]))
7979
cls.ctx = None
8080
cls.queue = None
8181

@@ -88,8 +88,8 @@ def setUp(self):
8888
self.data_d = pyopencl.array.to_device(self.queue, self.data)
8989
self.sum_d = pyopencl.array.zeros_like(self.data_d)
9090
self.program = pyopencl.Program(self.ctx, get_opencl_code("pyfai:openCL/collective/reduction.cl")+
91-
get_opencl_code("pyfai:openCL/collective/scan.cl")
92-
).build()
91+
get_opencl_code("pyfai:openCL/collective/scan.cl")+
92+
get_opencl_code("pyfai:openCL/collective/comb_sort.cl")).build()
9393

9494
def tearDown(self):
9595
self.img = self.data = None
@@ -230,10 +230,94 @@ def test_Blelloch_multipass(self):
230230
logger.info("Wg: %s result: cumsum good: %s", wg, good)
231231
self.assertTrue(good, "calculation is correct for WG=%s" % wg)
232232

233+
234+
@unittest.skipUnless(ocl, "pyopencl is missing")
235+
def test_sort(self):
236+
"""
237+
tests the sort of floating points in a workgroup
238+
"""
239+
data = numpy.arange(self.shape).astype(numpy.float32)
240+
numpy.random.shuffle(data)
241+
data_d = pyopencl.array.to_device(self.queue, data)
242+
243+
maxi = int(round(numpy.log2(self.shape)))+1
244+
for i in range(5,maxi):
245+
wg = 1 << i
246+
247+
ref = data.reshape((-1, wg))
248+
positions = ((numpy.arange(ref.shape[0])+1)*wg).astype(numpy.int32)
249+
positions_d = pyopencl.array.to_device(self.queue, positions)
250+
data_d = pyopencl.array.to_device(self.queue, data)
251+
# print(ref.shape, (ref.shape[0],min(wg, self.max_valid_wg)), (1, min(wg, self.max_valid_wg)), positions)
252+
try:
253+
evt = self.program.test_combsort_float(self.queue, (ref.shape[0],min(wg, self.max_valid_wg)), (1, min(wg, self.max_valid_wg)),
254+
data_d.data,
255+
positions_d.data,
256+
pyopencl.LocalMemory(4*min(wg, self.max_valid_wg)))
257+
evt.wait()
258+
except Exception as error:
259+
logger.error("Error %s on WG=%s: test_sort", error, wg)
260+
break
261+
else:
262+
res = data_d.get()
263+
ref = numpy.sort(ref)
264+
good = numpy.allclose(res, ref.ravel())
265+
logger.info("Wg: %s result: sort OK %s", wg, good)
266+
if not good:
267+
print(res.reshape(ref.shape))
268+
print(ref)
269+
print(numpy.where(res.reshape(ref.shape)-ref))
270+
271+
self.assertTrue(good, "calculation is correct for WG=%s" % wg)
272+
273+
@unittest.skipUnless(ocl, "pyopencl is missing")
274+
def test_sort4(self):
275+
"""
276+
tests the sort of floating points in a workgroup
277+
"""
278+
data = numpy.arange(self.shape).astype(numpy.float32)
279+
data = numpy.outer(data, numpy.ones(4, numpy.float32)).view(numpy.dtype([("s0","<f4"),("s1","<f4"),("s2","<f4"),("s3","<f4")]))
280+
numpy.random.shuffle(data)
281+
data_d = pyopencl.array.to_device(self.queue, data)
282+
283+
maxi = int(round(numpy.log2(self.shape)))+1
284+
for i in range(5,maxi):
285+
wg = 1 << i
286+
287+
ref = data.reshape((-1, wg))
288+
positions = ((numpy.arange(ref.shape[0])+1)*wg).astype(numpy.int32)
289+
positions_d = pyopencl.array.to_device(self.queue, positions)
290+
data_d = pyopencl.array.to_device(self.queue, data)
291+
# print(ref.shape, (ref.shape[0],min(wg, self.max_valid_wg)), (1, min(wg, self.max_valid_wg)), positions)
292+
try:
293+
evt = self.program.test_combsort_float4(self.queue, (ref.shape[0],min(wg, self.max_valid_wg)), (1, min(wg, self.max_valid_wg)),
294+
data_d.data,
295+
positions_d.data,
296+
pyopencl.LocalMemory(4*min(wg, self.max_valid_wg)))
297+
evt.wait()
298+
except Exception as error:
299+
logger.error("Error %s on WG=%s: test_sort", error, wg)
300+
break
301+
else:
302+
res = data_d.get()
303+
# print(res.dtype)
304+
ref = numpy.sort(ref, order="s0")
305+
# print(ref.dtype)
306+
good = numpy.allclose(res.view(numpy.float32).ravel(), ref.view(numpy.float32).ravel())
307+
logger.info("Wg: %s result: sort OK %s", wg, good)
308+
if not good:
309+
print(res.reshape(ref.shape))
310+
print(ref)
311+
print(numpy.where(res.reshape(ref.shape)-ref))
312+
313+
self.assertTrue(good, "calculation is correct for WG=%s" % wg)
314+
315+
316+
233317
def suite():
234318
loader = unittest.defaultTestLoader.loadTestsFromTestCase
235319
testSuite = unittest.TestSuite()
236-
testSuite.addTest(loader(TestReduction))
320+
testSuite.addTest(loader(TestGroupFunction))
237321
return testSuite
238322

239323

src/pyFAI/resources/openCL/collective/comb_sort.cl

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,46 @@ int inline first_step(int step, int size, float ratio)
2323
return step;
2424
}
2525

26+
// returns 1 if swapped, else 0
27+
int compare_and_swap(global volatile float* elements, int i, int j)
28+
{
29+
float vi = elements[i];
30+
float vj = elements[j];
31+
if (vi>vj)
32+
{
33+
elements[i] = vj;
34+
elements[j] = vi;
35+
return 1;
36+
}
37+
else
38+
return 0;
39+
}
40+
41+
// returns 1 if swapped, else 0
42+
int compare_and_swap_float4(global volatile float4* elements, int i, int j)
43+
{
44+
float4 vi = elements[i];
45+
float4 vj = elements[j];
46+
if (vi.s0>vj.s0)
47+
{
48+
elements[i] = vj;
49+
elements[j] = vi;
50+
return 1;
51+
}
52+
else
53+
return 0;
54+
}
55+
2656

2757

2858
// returns the number of swap performed
2959
int passe(global volatile float* elements,
3060
int size,
3161
int step,
32-
local volatile int* shared)
62+
local int* shared)
3363
{
34-
int wg = get_local_size(0);
35-
int tid = get_local_id(0);
64+
int wg = get_local_size(1);
65+
int tid = get_local_id(1);
3666
int cnt = 0;
3767
int i, j, k;
3868
barrier(CLK_GLOBAL_MEM_FENCE);
@@ -76,7 +106,7 @@ int passe(global volatile float* elements,
76106
if (step==1)
77107
{
78108
shared[tid] = cnt;
79-
return sum_reduction(shared);
109+
return sum_int_reduction(shared);
80110
}
81111
else
82112
return 0;
@@ -88,10 +118,10 @@ int passe(global volatile float* elements,
88118
int passe_float4(global volatile float4* elements,
89119
int size,
90120
int step,
91-
local volatile int* shared)
121+
local int* shared)
92122
{
93-
int wg = get_local_size(0);
94-
int tid = get_local_id(0);
123+
int wg = get_local_size(1);
124+
int tid = get_local_id(1);
95125
int cnt = 0;
96126
int i, j, k;
97127

@@ -135,19 +165,20 @@ int passe_float4(global volatile float4* elements,
135165
if (step==1)
136166
{
137167
shared[tid] = cnt;
138-
return sum_reduction(shared);
168+
return sum_int_reduction(shared);
139169
}
140170
else
141171
return 0;
142172
}
143173

144-
// workgroup: (wg, 1) grid:(wg, nb_lines), shared wg*sizeof(int)
145-
kernel void test_combsort_many(global volatile float* elements,
146-
global int* positions,
147-
local int* shared)
174+
// workgroup: (1, wg)
175+
// grid: (nb_lines, wg)
176+
// shared: wg*sizeof(int)
177+
kernel void test_combsort_float(global volatile float* elements,
178+
global int* positions,
179+
local int* shared)
148180
{
149-
local volatile int shared[1024];
150-
int gid = get_group_id(1);
181+
int gid = get_group_id(0);
151182
int step = 11; // magic value
152183
float ratio=1.3f; // magic value
153184
int cnt;
@@ -171,12 +202,14 @@ kernel void test_combsort_many(global volatile float* elements,
171202

172203
}
173204

174-
// workgroup: (wg, 1) grid:(wg, nb_lines), shared wg*sizeof(int)
205+
// workgroup: (1, wg)
206+
// grid: (nb_lines, wg)
207+
// shared: wg*sizeof(int)
175208
kernel void test_combsort_float4(global volatile float4* elements,
176209
global int* positions,
177-
local volatile int* shared)
210+
local int* shared)
178211
{
179-
int gid = get_group_id(1);
212+
int gid = get_group_id(0);
180213
int step = 11; // magic value
181214
float ratio=1.3f; // magic value
182215
int cnt;

src/pyFAI/resources/openCL/collective/reduction.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
int inline sum_int_reduction(local int* shared)
88
{
9-
int wg = get_local_size(0);
10-
int tid = get_local_id(0);
9+
int wg = get_local_size(0) * get_local_size(1);
10+
int tid = get_local_id(0) + get_local_size(0)*get_local_id(1);
1111

1212
// local reduction based implementation
1313
for (int stride=wg>>1; stride>0; stride>>=1)

0 commit comments

Comments
 (0)