Commit 422c7e3

Remove atomic use from multithreaded histogram building in CBlitImageFilter. Abstract the common parts out of alloc_decode_scratch. Some dead code is intentionally committed so that it is checked into source control, making it easier to tear the abstraction back down later if we decide we don't want it.
1 parent 3e87514 commit 422c7e3
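
For context, the change swaps a single shared histogram updated through std::atomic_ref for one private histogram per worker slot, merged serially afterwards. Below is a minimal standalone sketch of that pattern, not the CBlitImageFilter code itself; kBinCount, kSlotCount, countIntoSlot and mergeSlots are made-up names for illustration.

#include <cstdint>
#include <vector>

// Assumed toy parameters, not taken from the commit.
constexpr uint32_t kBinCount = 256u;
constexpr uint32_t kSlotCount = 16u; // one private histogram per worker slot

// Each worker owns histograms[slot*kBinCount .. slot*kBinCount+kBinCount) exclusively,
// so a plain increment replaces the old std::atomic_ref(...).fetch_add(1).
void countIntoSlot(std::vector<uint32_t>& histograms, uint32_t slot, uint32_t bin)
{
    histograms[slot * kBinCount + bin]++;
}

// After the parallel pass, fold every slot's histogram into slot 0.
void mergeSlots(std::vector<uint32_t>& histograms)
{
    for (uint32_t slot = 1u; slot < kSlotCount; ++slot)
        for (uint32_t bin = 0u; bin < kBinCount; ++bin)
            histograms[bin] += histograms[slot * kBinCount + bin];
}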

File tree

1 file changed: +111 −20 lines

include/nbl/asset/filters/CBlitImageFilter.h

Lines changed: 111 additions & 20 deletions
@@ -200,9 +200,7 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
 getIntermediateExtents(intermediateExtent, state, real_window_size);
 assert(intermediateExtent[0].x == intermediateExtent[2].x);
 
-const size_t MaxParallelism = std::thread::hardware_concurrency() * VectorizationBoundSTL;
-
-uint32_t pingBufferElementCount = (state->inExtent.width + real_window_size[0]) * MaxParallelism; // decode
+uint32_t pingBufferElementCount = (state->inExtent.width + real_window_size[0]) * m_maxParallelism; // decode
 uint32_t pongBufferElementCount = intermediateExtent[0].x * intermediateExtent[0].y * intermediateExtent[0].z; // x-axis filter output
 
 const uint32_t yWriteElementCount = intermediateExtent[1].x * intermediateExtent[1].y * intermediateExtent[1].z;
@@ -238,7 +236,7 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
 {
 size_t totalScratchSize = scaledKernelPhasedLUTSize + (pingBufferElementCount + pongBufferElementCount) * MaxChannels * sizeof(value_type);
 if (state->alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE)
-totalScratchSize += kAlphaHistogramSize*MaxParallelism;
+totalScratchSize += kAlphaHistogramSize*m_maxParallelism;
 return totalScratchSize;
 }
 }
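
Because every one of the m_maxParallelism slots now needs its own histogram, the coverage-adjustment scratch term scales accordingly. A back-of-the-envelope check with assumed values (the kAlphaHistogramSize value and bin count here are placeholders, not the real constants):

#include <cstdint>
#include <cstdio>
#include <thread>

int main()
{
    constexpr uint32_t VectorizationBoundSTL = 16u;                 // as in the file
    constexpr size_t kAlphaHistogramSize = 256u * sizeof(uint32_t); // assumed bin count of 256
    const uint32_t maxParallelism = std::thread::hardware_concurrency() * VectorizationBoundSTL;
    // e.g. 16 hardware threads -> 256 slots -> 256 KiB of histogram scratch instead of 1 KiB
    std::printf("histogram scratch: %zu bytes\n", kAlphaHistogramSize * maxParallelism);
    return 0;
}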
@@ -379,30 +377,70 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
 const auto outputTexelCount = outExtent.width*outExtent.height*outExtent.depth;
 const int64_t pixelsShouldPassCount = (coverage * core::rational<int64_t>(outputTexelCount)).getIntegerApprox();
 const int64_t pixelsShouldFailCount = outputTexelCount - pixelsShouldPassCount;
-// all values with index<=rankIndex will be %==inverseCoverage of the overall array
-const int64_t rankIndex = (coverage*core::rational<int64_t>(outputTexelCount)).getIntegerApprox()-1;
 
-// Todo(achal): I only use alphaBinCount*sizeof(uint32_t) here because I used atomic, switch to not using atomics.
-uint32_t* histogram = reinterpret_cast<uint32_t*>(state->scratchMemory + getScratchOffset(state, ESU_ALPHA_HISTOGRAM));
-memset(histogram, 0, state->alphaBinCount * sizeof(uint32_t));
+uint32_t* histograms = reinterpret_cast<uint32_t*>(state->scratchMemory + getScratchOffset(state, ESU_ALPHA_HISTOGRAM));
+memset(histograms, 0, m_maxParallelism* state->alphaBinCount * sizeof(uint32_t));
+
+ParallelScratchHelper scratchHelper;
+
+// uint64_t histogramIndices[VectorizationBoundSTL];
+// std::fill_n(histogramIndices, VectorizationBoundSTL, ~0ull);
+// std::mutex scratchLock;
+
+constexpr bool is_seq_policy_v = std::is_same_v<std::remove_reference_t<ExecutionPolicy>, core::execution::sequenced_policy>;
 
 struct DummyTexelType
 {
 double texel[MaxChannels];
 };
-std::for_each(policy, reinterpret_cast<DummyTexelType*>(intermediateStorage[axis]), reinterpret_cast<DummyTexelType*>(intermediateStorage[axis] + outputTexelCount*MaxChannels), [&sampler, outFormat, &histogram, alphaChannel, state](const DummyTexelType& dummyTexel)
+std::for_each(policy, reinterpret_cast<DummyTexelType*>(intermediateStorage[axis]), reinterpret_cast<DummyTexelType*>(intermediateStorage[axis] + outputTexelCount*MaxChannels), [&sampler, outFormat, &histograms, &scratchHelper, alphaChannel, state](const DummyTexelType& dummyTexel)
 {
+const uint32_t index = scratchHelper.alloc<is_seq_policy_v>();
+#if 0
+uint32_t index = ~0u;
+{
+std::unique_lock<std::mutex> lock(scratchLock);
+for (uint32_t j = 0u; j < VectorizationBoundSTL; ++j)
+{
+int32_t firstFree = core::findLSB(histogramIndices[j]);
+if (firstFree != -1)
+{
+index = j * 64u + firstFree;
+histogramIndices[j] ^= (0x1u << firstFree); // mark using
+break;
+}
+}
+assert(false);
+}
+#endif
+
 value_type texelAlpha = dummyTexel.texel[alphaChannel];
 texelAlpha -= double(sampler.nextSample()) * (asset::getFormatPrecision<value_type>(outFormat, alphaChannel, texelAlpha) / double(~0u));
 
-const uint32_t bucketIndex = uint32_t(core::round(core::clamp(texelAlpha, 0.0, 1.0) * double(state->alphaBinCount - 1)));
-assert(bucketIndex < state->alphaBinCount);
-std::atomic_ref(histogram[bucketIndex]).fetch_add(1);
+const uint32_t binIndex = uint32_t(core::round(core::clamp(texelAlpha, 0.0, 1.0) * double(state->alphaBinCount - 1)));
+assert(binIndex < state->alphaBinCount);
+histograms[index*state->alphaBinCount+binIndex]++;
+
+scratchHelper.free<is_seq_policy_v>(index);
+
+#if 0
+{
+std::unique_lock<std::mutex> lock(scratchLock);
+histogramIndices[index/64u] ^= (0x1u<<(index%64u)); // mark free
+}
+#endif
 });
 
-std::inclusive_scan(histogram, histogram+state->alphaBinCount, histogram);
-const uint32_t bucketIndex = std::lower_bound(histogram, histogram+state->alphaBinCount, pixelsShouldFailCount) - histogram;
-const double newAlphaRefValue = core::min((bucketIndex - 0.5) / double(state->alphaBinCount - 1), 1.0);
+uint32_t* mergedHistogram = histograms;
+for (auto hi = 1; hi < m_maxParallelism; ++hi)
+{
+for (auto bi = 0; bi < state->alphaBinCount; ++bi)
+histograms[bi] += histograms[hi * state->alphaBinCount + bi];
+}
+
+std::inclusive_scan(mergedHistogram, mergedHistogram +state->alphaBinCount, mergedHistogram);
+const uint32_t binIndex = std::lower_bound(mergedHistogram, mergedHistogram +state->alphaBinCount, pixelsShouldFailCount) - mergedHistogram;
+const double newAlphaRefValue = core::min((binIndex - 0.5) / double(state->alphaBinCount - 1), 1.0);
 coverageScale = alphaRefValue / newAlphaRefValue;
 }
 auto scaleCoverage = [outData,outOffsetLayer,intermediateStrides,axis,intermediateStorage,alphaChannel,coverageScale,storeToTexel](uint32_t writeBlockArrayOffset, core::vectorSIMDu32 writeBlockPos) -> void
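
After the per-slot histograms are folded together, the hunk above turns the merged histogram into a running total and searches it for the first bin at which the fail count is reached. A small self-contained sketch of that CDF-plus-lower_bound step, using made-up values for alphaBinCount, the per-bin counts, and pixelsShouldFailCount (standard <numeric>/<algorithm> calls only, not the Nabla state machinery):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
    constexpr uint32_t alphaBinCount = 8u;                         // assumed; the real code uses state->alphaBinCount
    std::vector<uint32_t> histogram = {4, 0, 2, 1, 0, 3, 5, 1};    // made-up per-bin counts
    const int64_t pixelsShouldFailCount = 6;                       // made-up target

    // turn counts into a running total (CDF), in place
    std::inclusive_scan(histogram.begin(), histogram.end(), histogram.begin());
    // first bin whose cumulative count reaches the fail target
    const uint32_t binIndex =
        std::lower_bound(histogram.begin(), histogram.end(), pixelsShouldFailCount) - histogram.begin();
    // map the bin back to a normalized alpha threshold, as the hunk above does
    const double newAlphaRefValue = std::min((binIndex - 0.5) / double(alphaBinCount - 1), 1.0);
    std::printf("bin %u -> newAlphaRefValue %f\n", binIndex, newAlphaRefValue);
    return 0;
}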
@@ -488,6 +526,8 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
 const int loopCoordID[2] = {/*axis,*/axis!=IImage::ET_2D ? 1:0,axis!=IImage::ET_3D ? 2:0};
 //
 assert(is_seq_policy_v || std::thread::hardware_concurrency()<=64u);
+ParallelScratchHelper scratchHelper;
+#if 0
 uint64_t decodeScratchAllocs[VectorizationBoundSTL];
 std::fill_n(decodeScratchAllocs,VectorizationBoundSTL,~0u);
 std::mutex scratchLock;
@@ -519,6 +559,7 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
 }
 };
 //
+#endif
 constexpr uint32_t batch_dims = 2u;
 const uint32_t batchExtent[batch_dims] = {
 static_cast<uint32_t>(intermediateExtent[axis][loopCoordID[0]]),
@@ -529,6 +570,8 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
 CBasicImageFilterCommon::BlockIterator<batch_dims> end(begin.getExtentBatches(),spaceFillingEnd);
 std::for_each(policy,begin,end,[&](const std::array<uint32_t,batch_dims>& batchCoord) -> void
 {
+constexpr bool is_seq_policy_v = std::is_same_v<std::remove_reference_t<ExecutionPolicy>, core::execution::sequenced_policy>;
+
 // we need some tmp memory for threads in the first pass so that they dont step on each other
 uint32_t decode_offset;
 // whole line plus window borders
@@ -541,7 +584,8 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
 else
 {
 const auto inputEnd = inExtent.width+real_window_size.x;
-decode_offset = alloc_decode_scratch();
+// decode_offset = alloc_decode_scratch();
+decode_offset = scratchHelper.alloc<is_seq_policy_v>();
 lineBuffer = intermediateStorage[1]+decode_offset*MaxChannels*inputEnd;
 for (auto& i=localTexCoord.x; i<inputEnd; i++)
 {
@@ -627,8 +671,9 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
 if (++phaseIndex == phaseCount[axis])
 phaseIndex = 0;
 }
-if (axis==IImage::ET_1D)
-free_decode_scratch(decode_offset);
+if (axis == IImage::ET_1D)
+scratchHelper.free<is_seq_policy_v>(decode_offset);
+// free_decode_scratch(decode_offset);
 });
 // we'll only get here if we have to do coverage adjustment
 if (needsNormalization && lastPass)
@@ -650,7 +695,53 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
 
 private:
 static inline constexpr uint32_t VectorizationBoundSTL = /*AVX2*/16u;
-//
+static inline const uint32_t m_maxParallelism = std::thread::hardware_concurrency() * VectorizationBoundSTL;
+
+class ParallelScratchHelper
+{
+public:
+ParallelScratchHelper()
+{
+std::fill_n(indices, VectorizationBoundSTL, ~0ull);
+}
+
+template<bool isSeqPolicy>
+inline uint32_t alloc()
+{
+if constexpr (isSeqPolicy)
+return 0;
+
+std::unique_lock<std::mutex> lock(mutex);
+for (uint32_t j = 0u; j < VectorizationBoundSTL; ++j)
+{
+int32_t firstFree = core::findLSB(indices[j]);
+if (firstFree != -1)
+{
+indices[j] ^= (0x1u << firstFree); // mark using
+return j * MaxCores + firstFree;
+}
+}
+assert(false);
+return 0xdeadbeef;
+}
+
+template<bool isSeqPolicy>
+inline void free(const uint32_t index)
+{
+if constexpr (!isSeqPolicy)
+{
+std::unique_lock<std::mutex> lock(mutex);
+indices[index / MaxCores] ^= (0x1u << (index % MaxCores)); // mark free
+}
+}
+
+private:
+static inline constexpr auto MaxCores = 64;
+
+uint64_t indices[VectorizationBoundSTL];
+std::mutex mutex;
+};
+
 static inline void getIntermediateExtents(core::vectorSIMDi32* intermediateExtent, const state_type* state, const core::vectorSIMDi32& real_window_size)
 {
 assert(intermediateExtent);
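
The ParallelScratchHelper added above is essentially a mutex-guarded bitmask allocator that hands each concurrent lambda invocation an exclusive scratch-slot index for the duration of its body. An illustrative stand-alone re-creation of that usage pattern, not the committed class: SlotAllocator is a made-up name, std::countr_zero (C++20) stands in for the Nabla-internal core::findLSB, and a single 64-bit mask is assumed rather than an array of them.

#include <algorithm>
#include <bit>
#include <cassert>
#include <cstdint>
#include <execution>
#include <mutex>
#include <vector>

class SlotAllocator // same idea as ParallelScratchHelper, simplified
{
public:
    uint32_t alloc()
    {
        std::unique_lock<std::mutex> lock(mutex);
        const int firstFree = std::countr_zero(freeMask); // index of the lowest set (free) bit
        assert(firstFree < 64);                           // fires if more threads than slots
        freeMask ^= (1ull << firstFree);                  // mark the slot as in use
        return uint32_t(firstFree);
    }
    void free(const uint32_t slot)
    {
        std::unique_lock<std::mutex> lock(mutex);
        freeMask ^= (1ull << slot); // mark the slot as free again
    }
private:
    uint64_t freeMask = ~0ull; // one bit per scratch slot, 1 = free
    std::mutex mutex;
};

int main()
{
    SlotAllocator slots;
    std::vector<int> work(1024, 0);
    // each iteration borrows a slot for the duration of its body, like the blit loops do
    std::for_each(std::execution::par_unseq, work.begin(), work.end(), [&](int&)
    {
        const uint32_t slot = slots.alloc();
        // ... touch per-slot scratch memory here ...
        slots.free(slot);
    });
    return 0;
}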
