@@ -105,8 +105,8 @@ struct reduce<Config, BinOp, 2, device_capabilities>
105
105
scan_local = reduction0 (scan_local);
106
106
if (glsl::gl_SubgroupInvocationID ()==Config::SubgroupSize-1 )
107
107
{
108
- const uint32_t virtualSubgroupID = idx * ( Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID ();
109
- const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1 )) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
108
+ const uint32_t virtualSubgroupID = Config::virtualSubgroupID ( glsl::gl_SubgroupID (), idx );
109
+ const uint32_t bankedIndex = Config:: sharedMemCoalescedIndex (virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
110
110
scratchAccessor.template set<scalar_t>(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1 ]); // set last element of subgroup scan (reduction) to level 1 scan
111
111
}
112
112
}
@@ -165,8 +165,8 @@ struct scan<Config, BinOp, Exclusive, 2, device_capabilities>
165
165
dataAccessor.template set<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
166
166
if (glsl::gl_SubgroupInvocationID ()==Config::SubgroupSize-1 )
167
167
{
168
- const uint32_t virtualSubgroupID = idx * ( Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID ();
169
- const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1 )) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
168
+ const uint32_t virtualSubgroupID = Config::virtualSubgroupID ( glsl::gl_SubgroupID (), idx );
169
+ const uint32_t bankedIndex = Config:: sharedMemCoalescedIndex (virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
170
170
scratchAccessor.template set<scalar_t>(bankedIndex, value[Config::ItemsPerInvocation_0-1 ]); // set last element of subgroup scan (reduction) to level 1 scan
171
171
}
172
172
}
@@ -194,7 +194,7 @@ struct scan<Config, BinOp, Exclusive, 2, device_capabilities>
194
194
vector_lv0_t value;
195
195
dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
196
196
197
- const uint32_t virtualSubgroupID = idx * ( Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID ();
197
+ const uint32_t virtualSubgroupID = Config::virtualSubgroupID ( glsl::gl_SubgroupID (), idx );
198
198
scalar_t left;
199
199
scratchAccessor.template get<scalar_t>(virtualSubgroupID,left);
200
200
if (Exclusive)
@@ -244,8 +244,8 @@ struct reduce<Config, BinOp, 3, device_capabilities>
244
244
scan_local = reduction0 (scan_local);
245
245
if (glsl::gl_SubgroupInvocationID ()==Config::SubgroupSize-1 )
246
246
{
247
- const uint32_t virtualSubgroupID = idx * ( Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID ();
248
- const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1 )) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
247
+ const uint32_t virtualSubgroupID = Config::virtualSubgroupID ( glsl::gl_SubgroupID (), idx );
248
+ const uint32_t bankedIndex = Config:: sharedMemCoalescedIndex (virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
249
249
scratchAccessor.template set<scalar_t>(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1 ]); // set last element of subgroup scan (reduction) to level 1 scan
250
250
}
251
251
}
@@ -262,7 +262,7 @@ struct reduce<Config, BinOp, 3, device_capabilities>
262
262
lv1_val = reduction1 (lv1_val);
263
263
if (glsl::gl_SubgroupInvocationID ()==Config::SubgroupSize-1 )
264
264
{
265
- const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1 )) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2);
265
+ const uint32_t bankedIndex = Config:: sharedMemCoalescedIndex (invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2);
266
266
scratchAccessor.template set<scalar_t>(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1 ]);
267
267
}
268
268
}
@@ -321,8 +321,8 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
321
321
dataAccessor.template set<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
322
322
if (glsl::gl_SubgroupInvocationID ()==Config::SubgroupSize-1 )
323
323
{
324
- const uint32_t virtualSubgroupID = idx * ( Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID ();
325
- const uint32_t bankedIndex = (virtualSubgroupID & ( Config::ItemsPerInvocation_1- 1 )) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/ Config::ItemsPerInvocation_1);
324
+ const uint32_t virtualSubgroupID = Config::virtualSubgroupID ( glsl::gl_SubgroupID (), idx );
325
+ const uint32_t bankedIndex = Config::sharedMemCoalescedIndex (virtualSubgroupID, Config::ItemsPerInvocation_1);
326
326
scratchAccessor.template set<scalar_t>(bankedIndex, value[Config::ItemsPerInvocation_0-1 ]); // set last element of subgroup scan (reduction) to level 1 scan
327
327
}
328
328
}
@@ -340,7 +340,7 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
340
340
lv1_val = inclusiveScan1 (lv1_val);
341
341
if (glsl::gl_SubgroupInvocationID ()==Config::SubgroupSize-1 )
342
342
{
343
- const uint32_t bankedIndex = (glsl::gl_SubgroupID () & (Config::ItemsPerInvocation_2-1 )) * Config::SubgroupSize + (glsl::gl_SubgroupID ()/Config::ItemsPerInvocation_2);
343
+ const uint32_t bankedIndex = Config:: sharedMemCoalescedIndex (glsl:: gl_SubgroupID (), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2);
344
344
scratchAccessor.template set<scalar_t>(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1 ]);
345
345
}
346
346
}
@@ -378,7 +378,7 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
378
378
vector_lv0_t value;
379
379
dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
380
380
381
- const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID ();
381
+ const uint32_t virtualSubgroupID = Config:: virtualSubgroupID (glsl:: gl_SubgroupID (), idx); // idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID();
382
382
scalar_t left;
383
383
scratchAccessor.template get<scalar_t>(virtualSubgroupID, left);
384
384
if (Exclusive)
0 commit comments