Skip to content

Commit 4f257ba

Browse files
Workgroup Scans no longer stretch DXC's sema to the limit
P.S. fix missing `c_str()`
1 parent b79129f commit 4f257ba

File tree

3 files changed

+24
-22
lines changed

3 files changed

+24
-22
lines changed

examples_tests

include/nbl/builtin/hlsl/workgroup/shared_scan.hlsl

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -78,81 +78,83 @@ struct reduce
7878
};
7979

8080
template<class BinOp, bool Exclusive, uint16_t ItemCount>
81-
struct scan : reduce<BinOp,ItemCount>
81+
struct scan// : reduce<BinOp,ItemCount> https://github.com/microsoft/DirectXShaderCompiler/issues/5966
8282
{
8383
using base_t = reduce<BinOp,ItemCount>;
84+
base_t __base;
8485
using type_t = typename base_t::type_t;
8586

8687
template<class Accessor>
8788
type_t __call(NBL_CONST_REF_ARG(type_t) value, NBL_REF_ARG(Accessor) scratchAccessor)
8889
{
89-
base_t::template __call<Accessor>(value,scratchAccessor);
90+
__base.template __call<Accessor>(value,scratchAccessor);
9091

9192
const uint16_t subgroupID = uint16_t(glsl::gl_SubgroupID());
9293
// abuse integer wraparound to map 0 to 0xffffu
9394
const uint16_t prevSubgroupID = subgroupID-1;
9495

9596
// important check to prevent weird `firstbithigh` overlflows
96-
if(base_t::lastInvocation>=uint16_t(glsl::gl_SubgroupSize()))
97+
const uint16_t lastInvocation = ItemCount-1;
98+
if(lastInvocation>=uint16_t(glsl::gl_SubgroupSize()))
9799
{
98100
const uint16_t subgroupSizeLog2 = uint16_t(glsl::gl_SubgroupSizeLog2());
99101
// different than Upsweep cause we need to translate high level inclusive scans into exclusive on the fly, so we get the value of the subgroup behind our own in each level
100102
const uint16_t storeLoadIndexDiff = SubgroupContiguousIndex()-prevSubgroupID;
101103

102104
BinOp binop;
103105
// because DXC doesn't do references and I need my "frozen" registers
104-
#define scanStoreIndex base_t::scanLoadIndex
106+
#define scanStoreIndex __base.scanLoadIndex
105107
// we sloop over levels from highest to penultimate
106108
// as we iterate some previously active (higher level) invocations hold their exclusive prefix sum in `lastLevelScan`
107-
const uint16_t temp = firstbithigh(base_t::lastInvocation)/subgroupSizeLog2; // doing division then multiplication might be optimized away by the compiler
108-
const uint16_t initialLogShift = temp * subgroupSizeLog2;
109+
const uint16_t temp = uint16_t(firstbithigh(uint32_t(lastInvocation))/subgroupSizeLog2); // doing division then multiplication might be optimized away by the compiler
110+
const uint16_t initialLogShift = temp*subgroupSizeLog2;
109111
// TODO: later [unroll(scan_levels<ItemCount,MinSubgroupSize>::value-1)]
110112
[unroll(1)]
111113
for (uint16_t logShift=initialLogShift; bool(logShift); logShift-=subgroupSizeLog2)
112114
{
113115
// on the first iteration gl_SubgroupID==0 will participate but not afterwards because binop operand is identity
114-
if (base_t::participate)
116+
if (__base.participate)
115117
{
116118
// we need to add the higher level invocation exclusive prefix sum to current value
117119
if (logShift!=initialLogShift) // but the top level doesn't have any level above itself
118120
{
119121
// this is fine if on the way up you also += under `if (participate)`
120-
scanStoreIndex -= base_t::lastInvocationInLevel+1;
121-
base_t::lastLevelScan = binop(base_t::lastLevelScan,scratchAccessor.get(scanStoreIndex));
122+
scanStoreIndex -= __base.lastInvocationInLevel+1;
123+
__base.lastLevelScan = binop(__base.lastLevelScan,scratchAccessor.get(scanStoreIndex));
122124
}
123125
// now `lastLevelScan` has current level's inclusive prefux sum computed properly
124126
// note we're overwriting the same location with same invocation so no barrier needed
125127
// we store everything even though we'll never use the last entry due to shuffleup on read
126-
scratchAccessor.set(scanStoreIndex,base_t::lastLevelScan);
128+
scratchAccessor.set(scanStoreIndex,__base.lastLevelScan);
127129
}
128130
scratchAccessor.workgroupExecutionAndMemoryBarrier();
129131
// we're sneaky and exclude `gl_SubgroupID==0` from participation by abusing integer underflow
130-
base_t::participate = prevSubgroupID<base_t::lastInvocationInLevel;
131-
if (base_t::participate)
132+
__base.participate = prevSubgroupID<__base.lastInvocationInLevel;
133+
if (__base.participate)
132134
{
133135
// we either need to prevent OOB read altogether OR cmov identity after the far
134-
base_t::lastLevelScan = scratchAccessor.get(scanStoreIndex-storeLoadIndexDiff);
136+
__base.lastLevelScan = scratchAccessor.get(scanStoreIndex-storeLoadIndexDiff);
135137
}
136-
base_t::lastInvocationInLevel = base_t::lastInvocation>>logShift;
138+
__base.lastInvocationInLevel = lastInvocation>>logShift;
137139
}
138140
#undef scanStoreIndex
139141

140-
//assert((base_t::lastInvocation>>subgroupSizeLog2)==base_t::lastInvocationInLevel);
142+
//assert((__base.lastInvocation>>subgroupSizeLog2)==__base.lastInvocationInLevel);
141143

142144
// the very first prefix sum we did is in a register, not Accessor scratch mem hence the special path
143-
if (prevSubgroupID<base_t::lastInvocationInLevel)
144-
base_t::firstLevelScan = binop(base_t::lastLevelScan,base_t::firstLevelScan);
145+
if (prevSubgroupID<__base.lastInvocationInLevel)
146+
__base.firstLevelScan = binop(__base.lastLevelScan,__base.firstLevelScan);
145147
}
146148

147149
if(Exclusive)
148150
{
149-
base_t::firstLevelScan = glsl::subgroupShuffleUp(base_t::firstLevelScan,1);
151+
__base.firstLevelScan = glsl::subgroupShuffleUp(__base.firstLevelScan,1);
150152
// shuffle doesn't work between subgroups but the value for each elected subgroup invocation is just the previous higherLevelExclusive
151153
// note that we assume we might have to do scans with itemCount <= gl_WorkgroupSize
152154
if (glsl::subgroupElect())
153-
base_t::firstLevelScan = bool(subgroupID) ? base_t::lastLevelScan:BinOp::identity;
155+
__base.firstLevelScan = bool(subgroupID) ? __base.lastLevelScan:BinOp::identity;
154156
}
155-
return base_t::firstLevelScan;
157+
return __base.firstLevelScan;
156158
}
157159
};
158160
}

src/nbl/asset/utils/CGLSLCompiler.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ std::string CGLSLCompiler::preprocessShader(std::string&& code, IShader::E_SHADE
151151
auto res = comp.PreprocessGlsl(code, scstage, preprocessOptions.sourceIdentifier.data(), options);
152152

153153
if (res.GetCompilationStatus() != shaderc_compilation_status_success) {
154-
preprocessOptions.logger.log("%s\n", system::ILogger::ELL_ERROR, res.GetErrorMessage());
154+
preprocessOptions.logger.log("%s\n", system::ILogger::ELL_ERROR, res.GetErrorMessage().c_str());
155155
return nullptr;
156156
}
157157

0 commit comments

Comments
 (0)