@@ -200,9 +200,7 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
getIntermediateExtents(intermediateExtent, state, real_window_size);
assert(intermediateExtent[0].x == intermediateExtent[2].x);

- const size_t MaxParallelism = std::thread::hardware_concurrency() * VectorizationBoundSTL;
-
- uint32_t pingBufferElementCount = (state->inExtent.width + real_window_size[0]) * MaxParallelism; // decode
+ uint32_t pingBufferElementCount = (state->inExtent.width + real_window_size[0]) * m_maxParallelism; // decode
uint32_t pongBufferElementCount = intermediateExtent[0].x * intermediateExtent[0].y * intermediateExtent[0].z; // x-axis filter output

const uint32_t yWriteElementCount = intermediateExtent[1].x * intermediateExtent[1].y * intermediateExtent[1].z;
@@ -238,7 +236,7 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
{
size_t totalScratchSize = scaledKernelPhasedLUTSize + (pingBufferElementCount + pongBufferElementCount) * MaxChannels * sizeof(value_type);
if (state->alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE)
- totalScratchSize += kAlphaHistogramSize*MaxParallelism;
+ totalScratchSize += kAlphaHistogramSize*m_maxParallelism;
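+ // kAlphaHistogramSize bytes per potential worker now, since each one owns a private histogram during coverage adjustment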
return totalScratchSize;
}
}
@@ -379,30 +377,70 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
const auto outputTexelCount = outExtent.width*outExtent.height*outExtent.depth;
const int64_t pixelsShouldPassCount = (coverage * core::rational<int64_t>(outputTexelCount)).getIntegerApprox();
const int64_t pixelsShouldFailCount = outputTexelCount - pixelsShouldPassCount;
- // all values with index<=rankIndex will be %==inverseCoverage of the overall array
- const int64_t rankIndex = (coverage*core::rational<int64_t>(outputTexelCount)).getIntegerApprox()-1;

- // Todo(achal): I only use alphaBinCount*sizeof(uint32_t) here because I used atomic, switch to not using atomics.
- uint32_t* histogram = reinterpret_cast<uint32_t*>(state->scratchMemory + getScratchOffset(state, ESU_ALPHA_HISTOGRAM));
- memset(histogram, 0, state->alphaBinCount * sizeof(uint32_t));
+ uint32_t* histograms = reinterpret_cast<uint32_t*>(state->scratchMemory + getScratchOffset(state, ESU_ALPHA_HISTOGRAM));
+ memset(histograms, 0, m_maxParallelism * state->alphaBinCount * sizeof(uint32_t));
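+ // layout: m_maxParallelism contiguous histograms of state->alphaBinCount bins each, zeroed up front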
+
+ ParallelScratchHelper scratchHelper;
+
+ // uint64_t histogramIndices[VectorizationBoundSTL];
+ // std::fill_n(histogramIndices, VectorizationBoundSTL, ~0ull);
+ // std::mutex scratchLock;
+
+ constexpr bool is_seq_policy_v = std::is_same_v<std::remove_reference_t<ExecutionPolicy>, core::execution::sequenced_policy>;
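+ // under a sequenced policy the helper always hands out slot 0 and skips locking entirely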

struct DummyTexelType
{
double texel[MaxChannels];
};
- std::for_each(policy, reinterpret_cast<DummyTexelType*>(intermediateStorage[axis]), reinterpret_cast<DummyTexelType*>(intermediateStorage[axis] + outputTexelCount*MaxChannels), [&sampler, outFormat, &histogram, alphaChannel, state](const DummyTexelType& dummyTexel)
+ std::for_each(policy, reinterpret_cast<DummyTexelType*>(intermediateStorage[axis]), reinterpret_cast<DummyTexelType*>(intermediateStorage[axis] + outputTexelCount*MaxChannels), [&sampler, outFormat, &histograms, &scratchHelper, alphaChannel, state](const DummyTexelType& dummyTexel)
{
+ const uint32_t index = scratchHelper.alloc<is_seq_policy_v>();
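+ // reserve a private scratch slot so concurrent invocations of this lambda never share a histogram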
+ #if 0
+ uint32_t index = ~0u;
+ {
+ std::unique_lock<std::mutex> lock(scratchLock);
+ for (uint32_t j = 0u; j < VectorizationBoundSTL; ++j)
+ {
+ int32_t firstFree = core::findLSB(histogramIndices[j]);
+ if (firstFree != -1)
+ {
+ index = j * 64u + firstFree;
+ histogramIndices[j] ^= (0x1u << firstFree); // mark using
+ break;
+ }
+ }
+ assert(false);
+ }
+ #endif
+
value_type texelAlpha = dummyTexel.texel[alphaChannel];
texelAlpha -= double(sampler.nextSample()) * (asset::getFormatPrecision<value_type>(outFormat, alphaChannel, texelAlpha) / double(~0u));

- const uint32_t bucketIndex = uint32_t(core::round(core::clamp(texelAlpha, 0.0, 1.0) * double(state->alphaBinCount - 1)));
- assert(bucketIndex < state->alphaBinCount);
- std::atomic_ref(histogram[bucketIndex]).fetch_add(1);
+ const uint32_t binIndex = uint32_t(core::round(core::clamp(texelAlpha, 0.0, 1.0) * double(state->alphaBinCount - 1)));
+ assert(binIndex < state->alphaBinCount);
+ histograms[index*state->alphaBinCount + binIndex]++;
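+ // increment this worker's private bin; no atomic needed since the slot is exclusively owned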
+
+ scratchHelper.free<is_seq_policy_v>(index);
+
+ #if 0
+ {
+ std::unique_lock<std::mutex> lock(scratchLock);
+ histogramIndices[index/64u] ^= (0x1u<<(index%64u)); // mark free
+ }
+ #endif
});

- std::inclusive_scan(histogram, histogram+state->alphaBinCount, histogram);
- const uint32_t bucketIndex = std::lower_bound(histogram, histogram+state->alphaBinCount, pixelsShouldFailCount) - histogram;
- const double newAlphaRefValue = core::min((bucketIndex - 0.5) / double(state->alphaBinCount - 1), 1.0);
+ uint32_t* mergedHistogram = histograms;
+ for (auto hi = 1; hi < m_maxParallelism; ++hi)
+ {
+ for (auto bi = 0; bi < state->alphaBinCount; ++bi)
+ histograms[bi] += histograms[hi * state->alphaBinCount + bi];
+ }
+
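+ // the merged histogram becomes a CDF after the scan; lower_bound then picks the first bin whose
+ // cumulative count covers pixelsShouldFailCount, i.e. the alpha cutoff that fails the desired number of texels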
+ std::inclusive_scan(mergedHistogram, mergedHistogram + state->alphaBinCount, mergedHistogram);
+ const uint32_t binIndex = std::lower_bound(mergedHistogram, mergedHistogram + state->alphaBinCount, pixelsShouldFailCount) - mergedHistogram;
+ const double newAlphaRefValue = core::min((binIndex - 0.5) / double(state->alphaBinCount - 1), 1.0);
coverageScale = alphaRefValue / newAlphaRefValue;
}
auto scaleCoverage = [outData,outOffsetLayer,intermediateStrides,axis,intermediateStorage,alphaChannel,coverageScale,storeToTexel](uint32_t writeBlockArrayOffset, core::vectorSIMDu32 writeBlockPos) -> void
@@ -488,6 +526,8 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
const int loopCoordID[2] = {/*axis,*/ axis!=IImage::ET_2D ? 1:0,axis!=IImage::ET_3D ? 2:0};
//
assert(is_seq_policy_v || std::thread::hardware_concurrency()<=64u);
+ ParallelScratchHelper scratchHelper;
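+ // note: the helper tracks VectorizationBoundSTL*64 slots while up to m_maxParallelism =
+ // hardware_concurrency()*VectorizationBoundSTL workers may allocate at once, which is presumably
+ // why the assert above caps hardware_concurrency() at 64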
+ #if 0
uint64_t decodeScratchAllocs[VectorizationBoundSTL];
std::fill_n(decodeScratchAllocs,VectorizationBoundSTL,~0u);
std::mutex scratchLock;
@@ -519,6 +559,7 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
}
};
//
+ #endif
constexpr uint32_t batch_dims = 2u;
const uint32_t batchExtent[batch_dims] = {
static_cast<uint32_t>(intermediateExtent[axis][loopCoordID[0]]),
@@ -529,6 +570,8 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
CBasicImageFilterCommon::BlockIterator<batch_dims> end(begin.getExtentBatches(),spaceFillingEnd);
std::for_each(policy,begin,end,[&](const std::array<uint32_t,batch_dims>& batchCoord) -> void
{
+ constexpr bool is_seq_policy_v = std::is_same_v<std::remove_reference_t<ExecutionPolicy>, core::execution::sequenced_policy>;
+
// we need some tmp memory for threads in the first pass so that they don't step on each other
uint32_t decode_offset;
// whole line plus window borders
@@ -541,7 +584,8 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
else
{
const auto inputEnd = inExtent.width+real_window_size.x;
- decode_offset = alloc_decode_scratch();
+ // decode_offset = alloc_decode_scratch();
+ decode_offset = scratchHelper.alloc<is_seq_policy_v>();
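+ // same bitmask allocator as the alpha histograms: each in-flight batch gets its own decode line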
lineBuffer = intermediateStorage[1]+decode_offset*MaxChannels*inputEnd;
for (auto& i=localTexCoord.x; i<inputEnd; i++)
{
@@ -627,8 +671,9 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di
if (++phaseIndex == phaseCount[axis])
phaseIndex = 0;
}
- if (axis==IImage::ET_1D)
- free_decode_scratch(decode_offset);
+ if (axis == IImage::ET_1D)
+ scratchHelper.free<is_seq_policy_v>(decode_offset);
+ // free_decode_scratch(decode_offset);
});
// we'll only get here if we have to do coverage adjustment
if (needsNormalization && lastPass)
@@ -650,7 +695,53 @@ class NBL_API CBlitImageFilter : public CImageFilter<CBlitImageFilter<Swizzle,Di

private:
static inline constexpr uint32_t VectorizationBoundSTL = /*AVX2*/16u;
- //
+ static inline const uint32_t m_maxParallelism = std::thread::hardware_concurrency() * VectorizationBoundSTL;
+
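+ // Hands out indices into per-worker scratch (alpha histograms, decode lines) using one 64-bit
+ // free-mask per vectorization slot: alloc() clears the lowest set bit under a mutex, free() sets
+ // it back; under a sequenced policy both collapse to slot 0 without locking.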
+ class ParallelScratchHelper
+ {
+ public:
+ ParallelScratchHelper()
+ {
+ std::fill_n(indices, VectorizationBoundSTL, ~0ull);
+ }
+
+ template <bool isSeqPolicy>
+ inline uint32_t alloc()
+ {
+ if constexpr (isSeqPolicy)
+ return 0;
+
+ std::unique_lock<std::mutex> lock(mutex);
+ for (uint32_t j = 0u; j < VectorizationBoundSTL; ++j)
+ {
+ int32_t firstFree = core::findLSB(indices[j]);
+ if (firstFree != -1)
+ {
+ indices[j] ^= (0x1ull << firstFree); // mark using; 64-bit literal, firstFree can exceed 31
+ return j * MaxCores + firstFree;
+ }
+ }
+ assert(false);
+ return 0xdeadbeef;
+ }
+
+ template <bool isSeqPolicy>
+ inline void free(const uint32_t index)
+ {
+ if constexpr (!isSeqPolicy)
+ {
+ std::unique_lock<std::mutex> lock(mutex);
+ indices[index / MaxCores] ^= (0x1ull << (index % MaxCores)); // mark free; 64-bit literal again
+ }
+ }
+
+ private:
+ static inline constexpr auto MaxCores = 64;
+
+ uint64_t indices[VectorizationBoundSTL];
+ std::mutex mutex;
+ };
+
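+ // Illustrative usage only (names `scratch` and `elementsPerSlot` are hypothetical):
+ //   ParallelScratchHelper helper;
+ //   const uint32_t slot = helper.alloc<false>(); // parallel policy: reserve a slot
+ //   scratch[slot*elementsPerSlot + i] = ...;     // touch only this slot's region
+ //   helper.free<false>(slot);                    // release it once the batch is done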
static inline void getIntermediateExtents(core::vectorSIMDi32* intermediateExtent, const state_type* state, const core::vectorSIMDi32& real_window_size)
{
assert(intermediateExtent);