Skip to content

Commit 1ec236c

Browse files
committed
Revert "Added support for work group collective functions in xasum routine."
This reverts commit 350e57a.
1 parent 2767448 commit 1ec236c

File tree

1 file changed

+23
-47
lines changed

1 file changed

+23
-47
lines changed

src/kernels/level1/xasum.opencl

Lines changed: 23 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,6 @@
1414
// literal). Comment-out this line for syntax-highlighting when developing.
1515
R"(
1616

17-
#ifdef cl_khr_work_group_uniform_arithmetic
18-
#pragma OPENCL EXTENSION cl_khr_work_group_uniform_arithmetic : enable
19-
#endif
20-
2117
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
2218
// this kernel file is used outside of the CLBlast library.
2319
#ifndef WGS1
@@ -59,25 +55,18 @@ void Xasum(const int n,
5955
lm[lid] = acc;
6056
barrier(CLK_LOCAL_MEM_FENCE);
6157

62-
// Performs reduction in local memory and stores the per work group result
63-
#if defined(cl_khr_work_group_uniform_arithmetic) || defined(__opencl_c_work_group_collective_functions)
64-
real result = work_group_reduce_add(lm[lid])
65-
66-
if (lid == 0) {
67-
output[wgid] = result;
68-
}
69-
#else
70-
for (int s=WGS1/2; s>0; s=s>>1) {
71-
if (lid < s) {
72-
Add(lm[lid], lm[lid], lm[lid + s]);
73-
}
74-
barrier(CLK_LOCAL_MEM_FENCE);
58+
// Performs reduction in local memory
59+
for (int s=WGS1/2; s>0; s=s>>1) {
60+
if (lid < s) {
61+
Add(lm[lid], lm[lid], lm[lid + s]);
7562
}
63+
barrier(CLK_LOCAL_MEM_FENCE);
64+
}
7665

77-
if (lid == 0) {
78-
output[wgid] = lm[0];
79-
}
80-
#endif
66+
// Stores the per-workgroup result
67+
if (lid == 0) {
68+
output[wgid] = lm[0];
69+
}
8170
}
8271

8372
// =================================================================================================
@@ -98,35 +87,22 @@ void XasumEpilogue(const __global real* restrict input,
9887
Add(lm[lid], input[lid], input[lid + WGS2]);
9988
barrier(CLK_LOCAL_MEM_FENCE);
10089

101-
// Performs reduction in local memory and stores the final result of the absolute value
102-
#if defined(cl_khr_work_group_uniform_arithmetic) || defined(__opencl_c_work_group_collective_functions)
103-
real result = work_group_reduce_add(lm[lid])
104-
105-
if (lid == 0) {
106-
#if (PRECISION == 3232 || PRECISION == 6464) && defined(ROUTINE_ASUM)
107-
asum[asum_offset].x = real.x + real.y; // the result is a non-complex number
108-
#else
109-
asum[asum_offset] = real;
110-
#endif
90+
// Performs reduction in local memory
91+
for (int s=WGS2/2; s>0; s=s>>1) {
92+
if (lid < s) {
93+
Add(lm[lid], lm[lid], lm[lid + s]);
11194
}
112-
#else
113-
for (int s=WGS1/2; s>0; s=s>>1) {
114-
if (lid < s) {
115-
Add(lm[lid], lm[lid], lm[lid + s]);
116-
}
117-
barrier(CLK_LOCAL_MEM_FENCE);
118-
}
119-
120-
if (lid == 0) {
121-
#if (PRECISION == 3232 || PRECISION == 6464) && defined(ROUTINE_ASUM)
122-
asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number
123-
#else
124-
asum[asum_offset] = lm[0];
125-
#endif
126-
}
127-
#endif
95+
barrier(CLK_LOCAL_MEM_FENCE);
96+
}
12897

12998
// Computes the absolute value and stores the final result
99+
if (lid == 0) {
100+
#if (PRECISION == 3232 || PRECISION == 6464) && defined(ROUTINE_ASUM)
101+
asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number
102+
#else
103+
asum[asum_offset] = lm[0];
104+
#endif
105+
}
130106
}
131107

132108
// =================================================================================================

0 commit comments

Comments
 (0)