@@ -146,6 +146,7 @@ struct FFT<2,false, Scalar, device_capabilities>
         hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(threadID, _NBL_HLSL_WORKGROUP_SIZE_), lo, hi);
 
         // Run bigger steps until Subgroup-sized
+        [unroll]
         for (uint32_t stride = _NBL_HLSL_WORKGROUP_SIZE_ >> 1; stride > glsl::gl_SubgroupSize(); stride >>= 1)
         {
             FFT_loop< MemoryAdaptor<SharedMemoryAccessor> >(stride, lo, hi, threadID, sharedmemAdaptor);
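(Aside, not part of the commit: `[unroll]` is the HLSL loop attribute asking DXC to unroll the loop at compile time, and `[unroll(n)]` caps the unroll at a literal count, which is what the old `// [unroll(K/2)]` comments removed further down were gesturing at. A minimal standalone sketch; WORKGROUP_SIZE and the shader body are hypothetical stand-ins, not Nabla code:

#define WORKGROUP_SIZE 256

RWStructuredBuffer<float> output;

[numthreads(WORKGROUP_SIZE, 1, 1)]
void main(uint3 tid : SV_DispatchThreadID)
{
    float acc = 0.0;
    [unroll] // full unroll: the trip count is a preprocessor-time constant
    for (uint stride = WORKGROUP_SIZE >> 1; stride > 0; stride >>= 1)
        acc += (float)(tid.x & stride);
    [unroll(4)] // bounded variant: unroll at most 4 iterations
    for (uint i = 0; i < 4; ++i)
        acc *= 0.5;
    output[tid.x] = acc;
}
)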
@@ -211,6 +212,7 @@ struct FFT<2,true, Scalar, device_capabilities>
         fft::exchangeValues<MemoryAdaptor<SharedMemoryAccessor>, Scalar>::__call(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor);
 
         // The bigger steps
+        [unroll]
         for (uint32_t stride = glsl::gl_SubgroupSize() << 1; stride < _NBL_HLSL_WORKGROUP_SIZE_; stride <<= 1)
         {
             // Order of waiting for shared mem writes is also reversed here, since the shuffle came earlier
@@ -241,9 +243,10 @@ struct FFT<K, false, Scalar, device_capabilities>
     template<typename Accessor, typename SharedMemoryAccessor>
     static enable_if_t<(mpl::is_pot_v<K> && K > 2), void> __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
+        [unroll]
         for (uint32_t stride = (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_; stride > _NBL_HLSL_WORKGROUP_SIZE_; stride >>= 1)
         {
-            // [unroll(K/2)]
+            [unroll]
             for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_; virtualThreadID += _NBL_HLSL_WORKGROUP_SIZE_)
             {
                 const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
@@ -263,7 +266,7 @@ struct FFT<K, false, Scalar, device_capabilities>
 
         // do K/2 small workgroup FFTs
         DynamicOffsetAccessor<Accessor> offsetAccessor;
-        // [unroll(K/2)]
+        [unroll]
        for (uint32_t k = 0; k < K; k += 2)
         {
             if (k)
@@ -284,7 +287,7 @@ struct FFT<K, true, Scalar, device_capabilities>
     {
         // do K/2 small workgroup FFTs
         DynamicOffsetAccessor<Accessor> offsetAccessor;
-        // [unroll(K/2)]
+        [unroll]
         for (uint32_t k = 0; k < K; k += 2)
         {
             if (k)
@@ -293,11 +296,12 @@ struct FFT<K, true, Scalar, device_capabilities>
             FFT<2,true, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
         }
         accessor = offsetAccessor.accessor;
-
+
+        [unroll]
         for (uint32_t stride = 2 * _NBL_HLSL_WORKGROUP_SIZE_; stride < K * _NBL_HLSL_WORKGROUP_SIZE_; stride <<= 1)
         {
             accessor.memoryBarrier(); // no execution barrier just making sure writes propagate to accessor
-            // [unroll(K/2)]
+            [unroll]
             for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_; virtualThreadID += _NBL_HLSL_WORKGROUP_SIZE_)
             {
                 const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
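(Aside, not from the commit: the `loIx` expression in both `virtualThreadID` loops splits the flat index at the stride boundary to address the lower element of a radix-2 butterfly, whose partner sits exactly `stride` slots higher. A sketch under a hypothetical helper name, worked through in the comments:

// butterflyIndices is my name, not Nabla's; stride must be a power of two.
uint2 butterflyIndices(uint virtualThreadID, uint stride)
{
    // High bits pick which 2*stride-sized block the pair lives in, so they are
    // shifted left once to hop over the block's upper half; the low bits are
    // the offset inside the lower half.
    const uint loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
    // Example: stride = 4, virtualThreadID = 5 (0b101)
    //   -> high = 0b100 << 1 = 0b1000, low = 0b001, so loIx = 9 and hiIx = 13.
    const uint hiIx = loIx | stride;
    return uint2(loIx, hiIx);
}
)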