1414// literal). Comment-out this line for syntax-highlighting when developing.
1515R "(
1616
17- #ifdef cl_khr_work_group_uniform_arithmetic
18- #pragma OPENCL EXTENSION cl_khr_work_group_uniform_arithmetic : enable
19- #endif
20-
2117// Parameters set by the tuner or by the database. Here they are given a basic default value in case
2218// this kernel file is used outside of the CLBlast library.
2319#ifndef WGS1
@@ -59,25 +55,18 @@ void Xasum(const int n,
5955 lm [lid ] = acc ;
6056 barrier (CLK_LOCAL_MEM_FENCE );
6157
62- // Performs reduction in local memory and stores the per work group result
63- #if defined(cl_khr_work_group_uniform_arithmetic ) || defined(__opencl_c_work_group_collective_functions )
64- real result = work_group_reduce_add (lm [lid ])
65-
66- if (lid == 0 ) {
67- output [wgid ] = result ;
68- }
69- #else
70- for (int s = WGS1 /2 ; s > 0 ; s = s >>1 ) {
71- if (lid < s ) {
72- Add (lm [lid ], lm [lid ], lm [lid + s ]);
73- }
74- barrier (CLK_LOCAL_MEM_FENCE );
58+ // Performs reduction in local memory
59+ for (int s = WGS1 /2 ; s > 0 ; s = s >>1 ) {
60+ if (lid < s ) {
61+ Add (lm [lid ], lm [lid ], lm [lid + s ]);
7562 }
63+ barrier (CLK_LOCAL_MEM_FENCE );
64+ }
7665
77- if ( lid == 0 ) {
78- output [ wgid ] = lm [ 0 ];
79- }
80- #endif
66+ // Stores the per-workgroup result
67+ if ( lid == 0 ) {
68+ output [ wgid ] = lm [ 0 ];
69+ }
8170}
8271
8372// =================================================================================================
@@ -98,35 +87,22 @@ void XasumEpilogue(const __global real* restrict input,
9887 Add (lm [lid ], input [lid ], input [lid + WGS2 ]);
9988 barrier (CLK_LOCAL_MEM_FENCE );
10089
101- // Performs reduction in local memory and stores the final result of the absolute value
102- #if defined(cl_khr_work_group_uniform_arithmetic ) || defined(__opencl_c_work_group_collective_functions )
103- real result = work_group_reduce_add (lm [lid ])
104-
105- if (lid == 0 ) {
106- #if (PRECISION == 3232 || PRECISION == 6464 ) && defined(ROUTINE_ASUM )
107- asum [asum_offset ].x = real .x + real .y ; // the result is a non-complex number
108- #else
109- asum [asum_offset ] = real ;
110- #endif
90+ // Performs reduction in local memory
91+ for (int s = WGS2 /2 ; s > 0 ; s = s >>1 ) {
92+ if (lid < s ) {
93+ Add (lm [lid ], lm [lid ], lm [lid + s ]);
11194 }
112- #else
113- for (int s = WGS1 /2 ; s > 0 ; s = s >>1 ) {
114- if (lid < s ) {
115- Add (lm [lid ], lm [lid ], lm [lid + s ]);
116- }
117- barrier (CLK_LOCAL_MEM_FENCE );
118- }
119-
120- if (lid == 0 ) {
121- #if (PRECISION == 3232 || PRECISION == 6464 ) && defined(ROUTINE_ASUM )
122- asum [asum_offset ].x = lm [0 ].x + lm [0 ].y ; // the result is a non-complex number
123- #else
124- asum [asum_offset ] = lm [0 ];
125- #endif
126- }
127- #endif
95+ barrier (CLK_LOCAL_MEM_FENCE );
96+ }
12897
12998 // Computes the absolute value and stores the final result
99+ if (lid == 0 ) {
100+ #if (PRECISION == 3232 || PRECISION == 6464 ) && defined(ROUTINE_ASUM )
101+ asum [asum_offset ].x = lm [0 ].x + lm [0 ].y ; // the result is a non-complex number
102+ #else
103+ asum [asum_offset ] = lm [0 ];
104+ #endif
105+ }
130106}
131107
132108// =================================================================================================
0 commit comments