@@ -104,10 +104,10 @@ struct reduce<Config, BinOp, 2, device_capabilities>
         vector_lv0_t scan_local;
         dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local);
         scan_local = reduction0(scan_local);
-        if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+        if (Config::electLast())
         {
             const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-            const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
+            const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
             scratchAccessor.template set<scalar_t>(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan
         }
     }
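The hunk above swaps the hand-written last-invocation check and banked-index arithmetic for `Config` helpers. Below is a minimal sketch of what those helpers are assumed to compute, reconstructed only from the expressions and the commented-out formula they replace; the actual configuration struct in the repository may differ.

```hlsl
// Hypothetical sketch, not the repository's real Config; values and names are assumptions.
struct ConfigSketch
{
    static const uint32_t SubgroupSize = 32;                  // example value, assumption
    static const uint32_t SubgroupsPerVirtualWorkgroup = 32;  // example value, assumption

    // wraps the old `glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1` check:
    // true only for the last invocation of the subgroup
    static bool electLast()
    {
        return glsl::gl_SubgroupInvocationID() == SubgroupSize - 1u;
    }

    // banked slot for a subgroup's partial result in the next level's scratch,
    // mirroring the commented-out formula in the hunk above
    static uint32_t sharedCoalescedIndexNextLevel(const uint32_t virtualSubgroupID, const uint32_t itemsPerInvocationNext)
    {
        return (virtualSubgroupID & (itemsPerInvocationNext - 1u)) * SubgroupsPerVirtualWorkgroup
             + (virtualSubgroupID / itemsPerInvocationNext);
    }
};
```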
@@ -120,10 +120,10 @@ struct reduce<Config, BinOp, 2, device_capabilities>
         vector_lv1_t lv1_val;
         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-            scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);
+            scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]);
         lv1_val = reduction1(lv1_val);

-        if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+        if (Config::electLast())
             scratchAccessor.template set<scalar_t>(0, lv1_val[Config::ItemsPerInvocation_1-1]);
     }
     scratchAccessor.workgroupExecutionAndMemoryBarrier();
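Higher-level values are now read back with a per-component helper in place of the explicit `i*Config::SubgroupSize+invocationIndex` arithmetic. Presumably it encapsulates that same component-major addressing; a hedged sketch, continuing the hypothetical `ConfigSketch` above:

```hlsl
// Assumed equivalent of the replaced expression: component-major layout, so for a fixed
// vector component `component` consecutive invocations touch consecutive scratch slots.
static uint32_t sharedCoalescedIndexByComponent(const uint32_t invocationIndex, const uint32_t component)
{
    return component * SubgroupSize + invocationIndex;
}
```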
@@ -159,10 +159,10 @@ struct scan<Config, BinOp, Exclusive, 2, device_capabilities>
         dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
         value = inclusiveScan0(value);
         dataAccessor.template set<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
-        if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+        if (Config::electLast())
         {
             const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-            const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
+            const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
             scratchAccessor.template set<scalar_t>(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan
         }
     }
@@ -176,12 +176,12 @@ struct scan<Config, BinOp, Exclusive, 2, device_capabilities>
         const uint32_t prevIndex = invocationIndex-1;
         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-            scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+prevIndex,lv1_val[i]);
+            scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]);
         lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex));
         lv1_val = inclusiveScan1(lv1_val);
         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-            scratchAccessor.template set<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);
+            scratchAccessor.template set<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]);
     }
     scratchAccessor.workgroupExecutionAndMemoryBarrier();

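In the hunk above each invocation loads its left neighbour's level-1 partial (`prevIndex`) and invocation 0 substitutes `BinOp::identity` via `hlsl::mix`, so the subsequent inclusive scan yields the exclusive prefix of the subgroup partials. A stand-alone illustration of that shift-then-scan pattern, with addition assumed as the binop and all names hypothetical:

```hlsl
// Hypothetical CPU-style illustration of the "read left neighbour, identity for slot 0,
// then inclusive scan" pattern; e.g. partials [5,2,7,1] become exclusive prefixes [0,5,7,14].
void exclusivePrefixesFromPartials(inout uint32_t partials[4])
{
    uint32_t shifted[4];
    [unroll]
    for (uint32_t i = 0u; i < 4u; i++)
        shifted[i] = (i == 0u) ? 0u : partials[i - 1u]; // 0u plays the role of BinOp::identity
    uint32_t running = 0u;
    [unroll]
    for (uint32_t i = 0u; i < 4u; i++)
    {
        running = running + shifted[i]; // inclusive scan over the shifted values
        partials[i] = running;
    }
}
```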
@@ -193,7 +193,7 @@ struct scan<Config, BinOp, Exclusive, 2, device_capabilities>
         dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);

         const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-        const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1);
+        const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1);
         scalar_t left;
         scratchAccessor.template get<scalar_t>(bankedIndex,left);
         if (Exclusive)
@@ -242,10 +242,10 @@ struct reduce<Config, BinOp, 3, device_capabilities>
         vector_lv0_t scan_local;
         dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local);
         scan_local = reduction0(scan_local);
-        if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+        if (Config::electLast())
         {
             const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-            const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
+            const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
             scratchAccessor.template set<scalar_t>(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan
         }
     }
@@ -258,11 +258,11 @@ struct reduce<Config, BinOp, 3, device_capabilities>
         vector_lv1_t lv1_val;
         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-            scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);
+            scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]);
         lv1_val = reduction1(lv1_val);
-        if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+        if (Config::electLast())
         {
-            const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2);
+            const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2);
             scratchAccessor.template set<scalar_t>(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]);
         }
     }
@@ -275,7 +275,7 @@ struct reduce<Config, BinOp, 3, device_capabilities>
         vector_lv2_t lv2_val;
         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++)
-            scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv2_val[i]);
+            scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]);
         lv2_val = reduction2(lv2_val);
         scratchAccessor.template set<scalar_t>(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]);
     }
@@ -314,10 +314,10 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
         dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
         value = inclusiveScan0(value);
         dataAccessor.template set<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
-        if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+        if (Config::electLast())
         {
             const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-            const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1);
+            const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1);
             scratchAccessor.template set<scalar_t>(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan
         }
     }
@@ -332,15 +332,15 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
         const uint32_t prevIndex = invocationIndex-1;
         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-            scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+prevIndex,lv1_val[i]);
+            scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]);
         lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex));
         lv1_val = inclusiveScan1(lv1_val);
         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-            scratchAccessor.template set<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);
-        if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+            scratchAccessor.template set<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]);
+        if (Config::electLast())
         {
-            const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2);
+            const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2);
             scratchAccessor.template set<scalar_t>(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]);
         }
     }
@@ -354,12 +354,12 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
         const uint32_t prevIndex = invocationIndex-1;
         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++)
-            scratchAccessor.template get<scalar_t>(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]);
+            scratchAccessor.template get<scalar_t>(lv1_smem_size+Config::sharedCoalescedIndexByComponent(prevIndex, i),lv2_val[i]);
         lv2_val[0] = hlsl::mix(hlsl::promote<vector_lv2_t>(BinOp::identity), lv2_val[0], bool(invocationIndex));
         lv2_val = inclusiveScan2(lv2_val);
         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++)
-            scratchAccessor.template set<scalar_t>(lv1_smem_size+i*Config::SubgroupSize+invocationIndex,lv2_val[i]);
+            scratchAccessor.template set<scalar_t>(lv1_smem_size+Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]);
     }
     scratchAccessor.workgroupExecutionAndMemoryBarrier();

@@ -372,12 +372,12 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
             scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);

         scalar_t lv2_scan;
-        const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2);
+        const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2);
         scratchAccessor.template get<scalar_t>(lv1_smem_size+bankedIndex, lv2_scan);

         [unroll]
         for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-            scratchAccessor.template set<scalar_t>(i*Config::SubgroupSize+invocationIndex, binop(lv1_val[i],lv2_scan));
+            scratchAccessor.template set<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i), binop(lv1_val[i],lv2_scan));
     }

     // combine with level 0
@@ -388,7 +388,7 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
         dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);

         const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-        const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1);
+        const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1);
         scalar_t left;
         scratchAccessor.template get<scalar_t>(bankedIndex,left);
         if (Exclusive)