@@ -118,18 +118,18 @@ class CBlitImageFilter :
118
118
public:
119
119
CState (blit_utils_t ::convolution_kernels_t && _kernels) : kernels(std::move(_kernels))
120
120
{
121
- inOffsetBaseLayer = core::vectorSIMDu32 ();
122
- inExtentLayerCount = core::vectorSIMDu32 ();
123
- outOffsetBaseLayer = core::vectorSIMDu32 ();
124
- outExtentLayerCount = core::vectorSIMDu32 ();
121
+ inOffsetBaseLayer = hlsl::uint32_t4 ();
122
+ inExtentLayerCount = hlsl::uint32_t4 ();
123
+ outOffsetBaseLayer = hlsl::uint32_t4 ();
124
+ outExtentLayerCount = hlsl::uint32_t4 ();
125
125
}
126
126
127
127
CState (const typename blit_utils_t ::convolution_kernels_t & _kernels) : kernels(_kernels)
128
128
{
129
- inOffsetBaseLayer = core::vectorSIMDu32 ();
130
- inExtentLayerCount = core::vectorSIMDu32 ();
131
- outOffsetBaseLayer = core::vectorSIMDu32 ();
132
- outExtentLayerCount = core::vectorSIMDu32 ();
129
+ inOffsetBaseLayer = hlsl::uint32_t4 ();
130
+ inExtentLayerCount = hlsl::uint32_t4 ();
131
+ outOffsetBaseLayer = hlsl::uint32_t4 ();
132
+ outExtentLayerCount = hlsl::uint32_t4 ();
133
133
}
134
134
135
135
CState (const CState& other) : IImageFilter::IState(), base_t ::CStateBase{other},
@@ -149,23 +149,23 @@ class CBlitImageFilter :
149
149
return false ;
150
150
const size_t offset = getScratchOffset (this ,ESU_SCALED_KERNEL_PHASED_LUT);
151
151
const auto inType = inImage->getCreationParameters ().type ;
152
- const size_t size = blit_utils_t::getScaledKernelPhasedLUTSize (inExtentLayerCount,outExtentLayerCount,inType,kernels);
152
+ const size_t size = blit_utils_t::getScaledKernelPhasedLUTSize (inExtentLayerCount. xyz ,outExtentLayerCount. xyz ,inType,kernels);
153
153
auto * lut = base_t ::CStateBase::scratchMemory+offset;
154
- return blit_utils_t::computeScaledKernelPhasedLUT (lut,inExtentLayerCount,outExtentLayerCount,inType, kernels);
154
+ return blit_utils_t::computeScaledKernelPhasedLUT (lut,inExtentLayerCount. xyz ,outExtentLayerCount. xyz ,inType, kernels);
155
155
}
156
156
157
157
union
158
158
{
159
- core::vectorSIMDu32 inOffsetBaseLayer;
159
+ hlsl::uint32_t4 inOffsetBaseLayer;
160
160
struct
161
161
{
162
- VkOffset3D inOffset;
163
- uint32_t inBaseLayer;
162
+ VkOffset3D inOffset;
163
+ uint32_t inBaseLayer;
164
164
};
165
165
};
166
166
union
167
167
{
168
- core::vectorSIMDu32 inExtentLayerCount;
168
+ hlsl::uint32_t4 inExtentLayerCount;
169
169
struct
170
170
{
171
171
VkExtent3D inExtent;
@@ -174,7 +174,7 @@ class CBlitImageFilter :
174
174
};
175
175
union
176
176
{
177
- core::vectorSIMDu32 outOffsetBaseLayer;
177
+ hlsl::uint32_t4 outOffsetBaseLayer;
178
178
struct
179
179
{
180
180
VkOffset3D outOffset;
@@ -183,7 +183,7 @@ class CBlitImageFilter :
183
183
};
184
184
union
185
185
{
186
- core::vectorSIMDu32 outExtentLayerCount;
186
+ hlsl::uint32_t4 outExtentLayerCount;
187
187
struct
188
188
{
189
189
VkExtent3D outExtent;
@@ -208,8 +208,7 @@ class CBlitImageFilter :
208
208
const auto windowSize = blit_utils_t::getWindowSize (inType, state->kernels );
209
209
const size_t scaledKernelPhasedLUTSize = blit_utils_t::getScaledKernelPhasedLUTSize (state->inExtentLayerCount , state->outExtentLayerCount , inType, windowSize);
210
210
211
- core::vectorSIMDi32 intermediateExtent[3 ];
212
- getIntermediateExtents (intermediateExtent, state, windowSize);
211
+ const auto intermediateExtent = getIntermediateExtents (state,windowSize);
213
212
assert (intermediateExtent[0 ].x == intermediateExtent[2 ].x );
214
213
215
214
uint32_t pingBufferElementCount = (state->inExtent .width + windowSize[0 ]) * m_maxParallelism; // decode
@@ -349,13 +348,7 @@ class CBlitImageFilter :
349
348
// filtering and alpha handling happens separately for every layer, so save on scratch memory size
350
349
const auto inImageType = inParams.type ;
351
350
const auto real_window_size = blit_utils_t::getWindowSize (inImageType,state->kernels );
352
- core::vectorSIMDi32 intermediateExtent[3 ];
353
- getIntermediateExtents (intermediateExtent, state, real_window_size);
354
- const core::vectorSIMDi32 intermediateLastCoord[3 ] = {
355
- intermediateExtent[0 ]-core::vectorSIMDi32 (1 ,1 ,1 ,0 ),
356
- intermediateExtent[1 ]-core::vectorSIMDi32 (1 ,1 ,1 ,0 ),
357
- intermediateExtent[2 ]-core::vectorSIMDi32 (1 ,1 ,1 ,0 )
358
- };
351
+ const hlsl::int32_t3x3 intermediateExtent = getIntermediateExtents (state,real_window_size);
359
352
value_t * const intermediateStorage[3 ] = {
360
353
reinterpret_cast <value_t *>(state->scratchMemory + getScratchOffset (state, ESU_BLIT_X_AXIS_WRITE)),
361
354
reinterpret_cast <value_t *>(state->scratchMemory + getScratchOffset (state, ESU_BLIT_Y_AXIS_WRITE)),
@@ -381,7 +374,7 @@ class CBlitImageFilter :
381
374
};
382
375
const std::span<const IImage::SBufferCopy> outRegions = outImg->getRegions (outMipLevel);
383
376
auto storeToImage = [policy,coverageSemantic,needsNormalization,outExtent,intermediateStorage,&sampler,outFormat,alphaRefValue,outData,intermediateStrides,alphaChannel,storeToTexel,outMipLevel,outOffset,outRegions,outImg,state](
384
- const core::rational<int64_t >& coverage, const int axis, const core::vectorSIMDu32 & outOffsetLayer
377
+ const core::rational<int64_t >& coverage, const int axis, const hlsl::uint32_t4 & outOffsetLayer
385
378
) -> void
386
379
{
387
380
assert (needsNormalization);
@@ -432,15 +425,16 @@ class CBlitImageFilter :
432
425
auto scaleCoverage = [outData,outOffsetLayer,intermediateStrides,axis,intermediateStorage,alphaChannel,coverageScale,storeToTexel](uint32_t writeBlockArrayOffset, core::vectorSIMDu32 writeBlockPos) -> void
433
426
{
434
427
void * const dstPix = outData+writeBlockArrayOffset;
435
- const core::vectorSIMDu32 localOutPos = writeBlockPos - outOffsetLayer;
428
+ for (auto i=0 ; i<4 ; i++)
429
+ writeBlockPos[i] -= outOffsetLayer[i];
436
430
437
431
value_t sample[ChannelCount];
438
- const size_t offset = IImage::SBufferCopy::getLocalByteOffset (localOutPos , intermediateStrides[axis]);
432
+ const size_t offset = IImage::SBufferCopy::getLocalByteOffset (writeBlockPos , intermediateStrides[axis]);
439
433
const auto * first = intermediateStorage[axis]+offset;
440
434
std::copy (first,first+ChannelCount,sample);
441
435
442
436
sample[alphaChannel] *= coverageScale;
443
- storeToTexel (sample,dstPix,localOutPos );
437
+ storeToTexel (sample,dstPix,writeBlockPos );
444
438
};
445
439
const ICPUImage::SSubresourceLayers subresource = {static_cast <IImage::E_ASPECT_FLAGS>(0u ),outMipLevel,outOffsetLayer.w ,1 };
446
440
const IImageFilter::IState::TexelRange range = {outOffset,outExtent};
@@ -452,32 +446,34 @@ class CBlitImageFilter :
452
446
453
447
// process
454
448
state->normalization .template initialize <double >();
455
- const core::vectorSIMDf fInExtent (inExtentLayerCount);
456
- const core::vectorSIMDf fOutExtent (outExtentLayerCount);
457
- const auto fScale = fInExtent .preciseDivision (fOutExtent );
458
- const auto halfTexelOffset = fScale *0 .5f -core::vectorSIMDf (0 .f ,0 .f ,0 .f ,0 .5f );
459
- const auto startCoord = [&halfTexelOffset,state]() -> core::vectorSIMDi32
449
+ const hlsl::float64_t3 fInExtent (inExtentLayerCount.x ,inExtentLayerCount.y ,inExtentLayerCount.z );
450
+ const hlsl::float64_t3 fOutExtent (outExtentLayerCount.x ,outExtentLayerCount.y ,outExtentLayerCount.z );
451
+ const auto fScale = hlsl::float32_t3 (fInExtent /fOutExtent );
452
+ const auto startCoord = [fScale ,state]() -> hlsl::int32_t4
460
453
{
461
- return core::vectorSIMDi32 (
454
+ const auto halfTexelOffset = fScale *0 .5f ;
455
+ return hlsl::int32_t4 (
462
456
std::get<0 >(state->kernels ).getWindowMinCoord (halfTexelOffset.x ),
463
457
std::get<1 >(state->kernels ).getWindowMinCoord (halfTexelOffset.y ),
464
- std::get<2 >(state->kernels ).getWindowMinCoord (halfTexelOffset.z ),0 );
458
+ std::get<2 >(state->kernels ).getWindowMinCoord (halfTexelOffset.z ),
459
+ 0
460
+ );
465
461
}();
466
- const auto windowMinCoordBase = inOffsetBaseLayer+startCoord;
462
+ // important we are aware of signedness here
463
+ const hlsl::int32_t4 windowMinCoordBase = hlsl::int32_t4 (inOffsetBaseLayer)+startCoord;
467
464
468
- core::vectorSIMDu32 phaseCount = IBlitUtilities::getPhaseCount (inExtentLayerCount, outExtentLayerCount, inImageType);
469
- phaseCount = core ::max (phaseCount, core::vectorSIMDu32 (1 , 1 , 1 ));
470
- const core::vectorSIMDu32 axisOffsets = blit_utils_t ::template getScaledKernelPhasedLUTAxisOffsets (phaseCount, real_window_size);
465
+ auto phaseCount = IBlitUtilities::getPhaseCount (inExtentLayerCount. xyz , outExtentLayerCount. xyz , inImageType);
466
+ phaseCount = hlsl ::max (phaseCount,hlsl::uint32_t3 (1 ,1 , 1 ));
467
+ const auto axisOffsets = blit_utils_t ::template getScaledKernelPhasedLUTAxisOffsets (phaseCount,real_window_size);
471
468
constexpr auto MaxAxisCount = 3 ;
472
469
lut_value_t * scaledKernelPhasedLUTPixel[MaxAxisCount];
473
470
for (auto i = 0 ; i < MaxAxisCount; ++i)
474
471
scaledKernelPhasedLUTPixel[i] = reinterpret_cast <lut_value_t *>(state->scratchMemory + getScratchOffset (state, ESU_SCALED_KERNEL_PHASED_LUT) + axisOffsets[i]);
475
472
476
473
for (uint32_t layer=0 ; layer!=layerCount; layer++) // TODO: could be parallelized
477
474
{
478
- const core::vectorSIMDi32 vLayer (0 ,0 ,0 ,layer);
479
- const auto windowMinCoord = windowMinCoordBase+vLayer;
480
- const auto outOffsetLayer = outOffsetBaseLayer+vLayer;
475
+ const hlsl::int32_t4 windowMinCoord (windowMinCoordBase.xyz ,windowMinCoordBase.w +layer);
476
+ const hlsl::uint32_t4 outOffsetLayer (outOffsetBaseLayer.xyz ,outOffsetBaseLayer.w +layer);
481
477
// reset coverage counter
482
478
constexpr bool is_seq_policy_v = std::is_same_v<std::remove_reference_t <ExecutionPolicy>,core::execution::sequenced_policy>;
483
479
using cond_atomic_int32_t = std::conditional_t <is_seq_policy_v,int32_t ,std::atomic_int32_t >;
@@ -517,19 +513,19 @@ class CBlitImageFilter :
517
513
uint32_t decode_offset;
518
514
// whole line plus window borders
519
515
value_t * lineBuffer;
520
- core::vectorSIMDi32 localTexCoord (0 );
516
+ hlsl::int32_t3 localTexCoord (0 , 0 , 0 );
521
517
localTexCoord[loopCoordID[0 ]] = batchCoord[0 ];
522
518
localTexCoord[loopCoordID[1 ]] = batchCoord[1 ];
523
519
if (axis!=IImage::ET_1D)
524
- lineBuffer = intermediateStorage[axis-1 ]+core ::dot (static_cast <const core::vectorSIMDi32 &>(intermediateStrides[axis-1 ]),localTexCoord)[ 0 ] ;
520
+ lineBuffer = intermediateStorage[axis-1 ]+hlsl ::dot (reinterpret_cast <const hlsl::int32_t3 &>(intermediateStrides[axis-1 ]),localTexCoord);
525
521
else
526
522
{
527
523
const auto inputEnd = inExtent.width +real_window_size.x ;
528
524
decode_offset = scratchHelper.template alloc <is_seq_policy_v>();
529
525
lineBuffer = intermediateStorage[1 ]+decode_offset*ChannelCount*inputEnd;
530
526
for (auto & i=localTexCoord.x ; i<inputEnd; i++)
531
527
{
532
- core::vectorSIMDi32 globalTexelCoord (localTexCoord+windowMinCoord);
528
+ core::vectorSIMDi32 globalTexelCoord (localTexCoord. x +windowMinCoord. x ,localTexCoord. y +windowMinCoord. y ,localTexCoord. z +windowMinCoord. z );
533
529
534
530
core::vectorSIMDu32 blockLocalTexelCoord (0u );
535
531
const void * srcPix[] = { // multiple loads for texture boundaries aren't that bad
@@ -562,11 +558,7 @@ class CBlitImageFilter :
562
558
563
559
auto getWeightedSample = [scaledKernelPhasedLUTPixel, windowSize, lineBuffer, &windowMinCoord, axis](const auto & windowCoord, const auto phaseIndex, const auto windowPixel, const auto channel) -> value_t
564
560
{
565
- value_t kernelWeight;
566
- if constexpr (std::is_same_v<lut_value_t , uint16_t >)
567
- kernelWeight = value_t (core::Float16Compressor::decompress (scaledKernelPhasedLUTPixel[axis][(phaseIndex * windowSize + windowPixel) * ChannelCount + channel]));
568
- else
569
- kernelWeight = scaledKernelPhasedLUTPixel[axis][(phaseIndex * windowSize + windowPixel) * ChannelCount + channel];
561
+ const value_t kernelWeight = static_cast <value_t >(scaledKernelPhasedLUTPixel[axis][(phaseIndex * windowSize + windowPixel) * ChannelCount + channel]);
570
562
571
563
return kernelWeight * lineBuffer[(windowCoord - windowMinCoord[axis]) * ChannelCount + channel];
572
564
};
@@ -576,11 +568,11 @@ class CBlitImageFilter :
576
568
for (auto & i=(localTexCoord[axis]=0 ); i<outExtentLayerCount[axis]; i++)
577
569
{
578
570
// get output pixel
579
- auto * const value = intermediateStorage[axis]+core ::dot (static_cast <const core::vectorSIMDi32 &>(intermediateStrides[axis]),localTexCoord)[ 0 ] ;
571
+ auto * const value = intermediateStorage[axis]+hlsl ::dot (reinterpret_cast <const hlsl::int32_t3 &>(intermediateStrides[axis]),localTexCoord);
580
572
581
573
// do the filtering
582
574
float tmp = float (i)+0 .5f ;
583
- int32_t windowCoord = kernel.getWindowMinCoord (tmp*fScale [axis], tmp);
575
+ int32_t windowCoord = kernel.getWindowMinCoord (tmp*fScale [axis],tmp);
584
576
585
577
for (auto ch = 0 ; ch < ChannelCount; ++ch)
586
578
value[ch] = getWeightedSample (windowCoord, phaseIndex, 0 , ch);
@@ -594,7 +586,12 @@ class CBlitImageFilter :
594
586
}
595
587
if (lastPass)
596
588
{
597
- const core::vectorSIMDu32 localOutPos = localTexCoord+outOffsetBaseLayer+vLayer;
589
+ const core::vectorSIMDu32 localOutPos (
590
+ outOffsetLayer.x +localTexCoord.x ,
591
+ outOffsetLayer.y +localTexCoord.y ,
592
+ outOffsetLayer.z +localTexCoord.z ,
593
+ outOffsetLayer.w +layer
594
+ );
598
595
if (needsNormalization)
599
596
state->normalization .prepass (value,localOutPos,0u ,0u ,ChannelCount);
600
597
else // store to image, we're done
@@ -678,13 +675,14 @@ class CBlitImageFilter :
678
675
std::mutex mutex;
679
676
};
680
677
681
- static inline void getIntermediateExtents (core::vectorSIMDi32* intermediateExtent, const state_type* state, const core::vectorSIMDi32& real_window_size)
678
+ // the WxHxD extent for each blit axis output
679
+ static inline hlsl::int32_t3x3 getIntermediateExtents (const state_type* state, const hlsl::int32_t3& real_window_size)
682
680
{
683
- assert ( intermediateExtent) ;
684
-
685
- intermediateExtent[0 ] = core::vectorSIMDi32 (state->outExtent .width , state->inExtent .height + real_window_size[ 1 ] , state->inExtent .depth + real_window_size[2 ]);
686
- intermediateExtent[1 ] = core::vectorSIMDi32 (state->outExtent .width , state->outExtent .height , state->inExtent .depth + real_window_size[ 2 ] );
687
- intermediateExtent[ 2 ] = core::vectorSIMDi32 (state-> outExtent . width , state-> outExtent . height , state-> outExtent . depth ) ;
681
+ hlsl::int32_t3x3 intermediateExtent;
682
+ intermediateExtent[ 0 ] = hlsl::int32_t3 (state-> outExtent . width , state-> inExtent . height + real_window_size[ 1 ], state-> inExtent . depth + real_window_size[ 2 ]);
683
+ intermediateExtent[1 ] = hlsl::int32_t3 (state->outExtent .width , state->outExtent .height , state->inExtent .depth + real_window_size[2 ]);
684
+ intermediateExtent[2 ] = hlsl::int32_t3 (state->outExtent .width , state->outExtent .height , state->outExtent .depth );
685
+ return intermediateExtent ;
688
686
}
689
687
};
690
688
0 commit comments