Skip to content

Commit adb35d6

Browse files
committed
cached inCPUImages for filters
1 parent b4068ae commit adb35d6

File tree

2 files changed

+85
-57
lines changed

2 files changed

+85
-57
lines changed

include/nbl/video/utilities/IUtilities.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,6 +1035,9 @@ class NBL_API ImageRegionIterator
10351035
private:
10361036
core::SRange<const asset::IImage::SBufferCopy> regions;
10371037

1038+
// Mock CPU Images used to copy cpu buffer to staging buffer
1039+
std::vector<core::smart_refctd_ptr<asset::ICPUImage>> imageFilterInCPUImages;
1040+
10381041
bool canTransferMipLevelsPartially = false;
10391042
asset::VkExtent3D minImageTransferGranularity = {};
10401043
uint32_t bufferOffsetAlignment = 1u;

src/nbl/video/utilities/IUtilities.cpp

Lines changed: 82 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -229,19 +229,69 @@ ImageRegionIterator::ImageRegionIterator(
229229
// [x] If Queue doesn't support GRAPHICS_BIT or COMPUTE_BIT -> must be multiple of 4
230230
// [x] bufferOffset must be a multiple of texel block size in bytes
231231
bufferOffsetAlignment = dstImageTexelBlockInfo.getBlockByteSize(); // can be non power of two
232-
if(asset::isDepthOrStencilFormat(dstImageFormat))
232+
if (asset::isDepthOrStencilFormat(dstImageFormat))
233233
bufferOffsetAlignment = std::lcm(bufferOffsetAlignment, 4u);
234234

235235
bool queueSupportsCompute = queueFamilyProps.queueFlags.hasFlags(IPhysicalDevice::EQF_COMPUTE_BIT);
236236
bool queueSupportsGraphics = queueFamilyProps.queueFlags.hasFlags(IPhysicalDevice::EQF_GRAPHICS_BIT);
237-
if((queueSupportsGraphics || queueSupportsCompute) == false)
237+
if ((queueSupportsGraphics || queueSupportsCompute) == false)
238238
bufferOffsetAlignment = std::lcm(bufferOffsetAlignment, 4u);
239239
// TODO: Need to have a function to get equivalent format of the specific plane of this format (in aspectMask)
240240
// if(asset::isPlanarFormat(dstImageFormat->getCreationParameters().format))
241241

242242
// Queues supporting graphics and/or compute operations must report (1,1,1) in minImageTransferGranularity, meaning that there are no additional restrictions on the granularity of image transfer operations for these queues.
243243
// Other queues supporting image transfer operations are only required to support whole mip level transfers, thus minImageTransferGranularity for queues belonging to such queue families may be (0,0,0)
244244
canTransferMipLevelsPartially = !(minImageTransferGranularity.width == 0 && minImageTransferGranularity.height == 0 && minImageTransferGranularity.depth == 0);
245+
246+
auto dstImageParams = dstImage->getCreationParameters();
247+
248+
/*
249+
We have to first construct two `ICPUImage`s per Region named `inCPUImage` and `outCPUImage`
250+
Then we will create fake ICPUBuffers that point to srcBuffer and stagingBuffer with correct offsets
251+
Then we have to set the buffer and regions for each one of those ICPUImages using setBufferAndRegions
252+
Finally we fill the filter state and `execute` which require in/out CPUImages
253+
*/
254+
255+
imageFilterInCPUImages.resize(regions.size());
256+
// imageFilterOutCPUImages.resize(regions.size());
257+
for (uint32_t i = 0; i < copyRegions.size(); ++i)
258+
{
259+
auto& inCPUImage = imageFilterInCPUImages[i];
260+
const auto region = regions[i];
261+
// inCPUImage is an image matching the params of dstImage but with the extents and layer count of the current region being copied and mipLevel 1u and the format being srcImageFormat
262+
// the buffer of this image is set to (srcBuffer+Offset) and the related region is set to cover the whole copy region (offset from 0)
263+
auto inCpuImageRegionsDynArray = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<asset::ICPUImage::SBufferCopy>>(1);
264+
auto& inCpuImageRegion = inCpuImageRegionsDynArray->front();
265+
inCpuImageRegion = {};
266+
inCpuImageRegion.bufferOffset = 0u;
267+
inCpuImageRegion.bufferRowLength = region.bufferRowLength;
268+
inCpuImageRegion.bufferImageHeight = region.bufferImageHeight;
269+
inCpuImageRegion.imageSubresource.aspectMask = region.imageSubresource.aspectMask;
270+
inCpuImageRegion.imageSubresource.mipLevel = 0u;
271+
inCpuImageRegion.imageSubresource.baseArrayLayer = 0u;
272+
inCpuImageRegion.imageOffset.x = 0u;
273+
inCpuImageRegion.imageOffset.y = 0u;
274+
inCpuImageRegion.imageOffset.z = 0u;
275+
inCpuImageRegion.imageExtent.width = region.imageExtent.width;
276+
inCpuImageRegion.imageExtent.height = region.imageExtent.height;
277+
inCpuImageRegion.imageExtent.depth = region.imageExtent.depth;
278+
inCpuImageRegion.imageSubresource.layerCount = region.imageSubresource.layerCount;
279+
280+
uint64_t offsetInCPUBuffer = region.bufferOffset;
281+
uint8_t* inCpuBufferPointer = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(srcBuffer->getPointer()) + offsetInCPUBuffer);
282+
asset::ICPUImage::SCreationParams inCPUImageParams = dstImageParams;
283+
inCPUImageParams.flags = asset::IImage::ECF_NONE; // Because we may want to write to the first few layers of a CUBEMAP (<6), but it's not valid to create a Cube ICPUImage with fewer than 6 layers.
284+
inCPUImageParams.format = srcImageFormat;
285+
inCPUImageParams.extent = region.imageExtent;
286+
inCPUImageParams.arrayLayers = region.imageSubresource.layerCount;
287+
inCPUImageParams.mipLevels = 1u; // since we copy one mip at a time to our dst image, it doesn't matter at the stage when we copy from cpu memory to staging memory
288+
inCPUImage = asset::ICPUImage::create(std::move(inCPUImageParams));
289+
assert(inCPUImage);
290+
core::smart_refctd_ptr<asset::ICPUBuffer> inCPUBuffer = core::make_smart_refctd_ptr< asset::CCustomAllocatorCPUBuffer<core::null_allocator<uint8_t>, true> >(srcBuffer->getSize(), inCpuBufferPointer, core::adopt_memory);
291+
inCPUImage->setBufferAndRegions(std::move(inCPUBuffer), inCpuImageRegionsDynArray);
292+
assert(inCPUImage->getBuffer());
293+
assert(inCPUImage->getRegions().size() > 0u);
294+
}
245295
}
246296

247297
size_t ImageRegionIterator::getMemoryNeededForRemainingRegions() const
@@ -347,6 +397,8 @@ struct PromotionComponentSwizzle
347397

348398
template<typename Filter>
349399
bool performCopyUsingImageFilter(
400+
const core::vector4du32_SIMD& inOffsetBaseLayer,
401+
const core::vector4du32_SIMD& ouOffsetBaseLayer,
350402
const core::smart_refctd_ptr<asset::ICPUImage>& inCPUImage,
351403
const core::smart_refctd_ptr<asset::ICPUImage>& outCPUImage,
352404
const asset::IImage::SBufferCopy& region)
@@ -357,7 +409,7 @@ bool performCopyUsingImageFilter(
357409
state.layerCount = region.imageSubresource.layerCount;
358410
state.inImage = inCPUImage.get();
359411
state.outImage = outCPUImage.get();
360-
state.inOffsetBaseLayer = core::vectorSIMDu32(0u);
412+
state.inOffsetBaseLayer = inOffsetBaseLayer;
361413
state.outOffsetBaseLayer = core::vectorSIMDu32(0u);
362414
state.inMipLevel = 0u;
363415
state.outMipLevel = 0u;
@@ -368,29 +420,31 @@ bool performCopyUsingImageFilter(
368420
return false;
369421
}
370422

371-
bool performCopy(
423+
bool performIntermediateCopy(
372424
asset::E_FORMAT srcImageFormat,
373425
asset::E_FORMAT dstImageFormat,
426+
const core::vector4du32_SIMD& inOffsetBaseLayer,
427+
const core::vector4du32_SIMD& outOffsetBaseLayer,
374428
const core::smart_refctd_ptr<asset::ICPUImage>& inCPUImage,
375429
const core::smart_refctd_ptr<asset::ICPUImage>& outCPUImage,
376430
const asset::IImage::SBufferCopy& region)
377431
{
378432
// In = srcBuffer, Out = stagingBuffer
379433
if (srcImageFormat == dstImageFormat)
380434
{
381-
return performCopyUsingImageFilter<asset::CCopyImageFilter>(inCPUImage, outCPUImage, region);
435+
return performCopyUsingImageFilter<asset::CCopyImageFilter>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
382436
}
383437
else
384438
{
385439
auto srcChannelCount = asset::getFormatChannelCount(srcImageFormat);
386440
if (srcChannelCount == 1u)
387-
performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<1u>>>(inCPUImage, outCPUImage, region);
441+
performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<1u>>>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
388442
else if (srcChannelCount == 2u)
389-
performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<2u>>>(inCPUImage, outCPUImage, region);
443+
performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<2u>>>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
390444
else if (srcChannelCount == 3u)
391-
performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<3u>>>(inCPUImage, outCPUImage, region);
445+
performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<3u>>>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
392446
else
393-
performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<4u>>>(inCPUImage, outCPUImage, region);
447+
performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<4u>>>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
394448
}
395449
}
396450

@@ -508,52 +562,15 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
508562

509563
// ! Function to create mock cpu images that can go into image filters for copying/converting
510564
auto createMockInOutCPUImagesForFilter = [&](core::smart_refctd_ptr<asset::ICPUImage>& inCPUImage, core::smart_refctd_ptr<asset::ICPUImage>& outCPUImage, const size_t outCPUBufferSize) -> void
511-
{
512-
/*
513-
We have to first construct two `ICPUImage`s from each of those buffers `inCPUImage` and `outCPUImage`
514-
Then we will create fake ICPUBuffers that point to srcBuffer and stagingBuffer with correct offsets
515-
Then we have to set the buffer and regions for each one of those ICPUImages using setBufferAndRegions
516-
Finally we fill the filter state and `execute` which require in/out CPUImages
517-
*/
518-
565+
{
566+
// this one is cached because we can
567+
inCPUImage = imageFilterInCPUImages[currentRegion];
519568
auto dstImageParams = dstImage->getCreationParameters();
520569

521-
// inCPUImage is an image matching the params of dstImage but with the extents and layer count of the current region being copied and mipLevel 1u and the format being srcImageFormat
522-
// the buffer of this image is set to (srcBuffer+Offset) and the related region is set to cover the whole copy region (offset from 0)
523-
{
524-
auto inCpuImageRegionsDynArray = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<asset::ICPUImage::SBufferCopy>>(1);
525-
auto& inCpuImageRegion = inCpuImageRegionsDynArray->front();
526-
inCpuImageRegion = {};
527-
inCpuImageRegion.bufferOffset = 0u;
528-
inCpuImageRegion.bufferRowLength = mainRegion.bufferRowLength;
529-
inCpuImageRegion.bufferImageHeight = mainRegion.bufferImageHeight;
530-
inCpuImageRegion.imageSubresource.aspectMask = mainRegion.imageSubresource.aspectMask;
531-
inCpuImageRegion.imageSubresource.mipLevel = 0u;
532-
inCpuImageRegion.imageSubresource.baseArrayLayer = 0u;
533-
inCpuImageRegion.imageOffset.x = 0u;
534-
inCpuImageRegion.imageOffset.y = 0u;
535-
inCpuImageRegion.imageOffset.z = 0u;
536-
inCpuImageRegion.imageExtent.width = regionToCopyNext.imageExtent.width;
537-
inCpuImageRegion.imageExtent.height = regionToCopyNext.imageExtent.height;
538-
inCpuImageRegion.imageExtent.depth = regionToCopyNext.imageExtent.depth;
539-
inCpuImageRegion.imageSubresource.layerCount = core::max(regionToCopyNext.imageSubresource.layerCount, 1u);
540-
541-
auto localImageOffset = core::vector4du32_SIMD(currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
542-
uint64_t offsetInCPUBuffer = mainRegion.bufferOffset + core::dot(localImageOffset, srcBufferByteStrides)[0];
543-
uint8_t* inCpuBufferPointer = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(srcBuffer->getPointer()) + offsetInCPUBuffer);
544-
asset::ICPUImage::SCreationParams inCPUImageParams = dstImageParams;
545-
inCPUImageParams.flags = asset::IImage::ECF_NONE; // Because we may want to write to the first few layers of a CUBEMAP (<6), but it's not valid to create a Cube ICPUImage with fewer than 6 layers.
546-
inCPUImageParams.format = srcImageFormat;
547-
inCPUImageParams.extent = regionToCopyNext.imageExtent;
548-
inCPUImageParams.arrayLayers = regionToCopyNext.imageSubresource.layerCount;
549-
inCPUImageParams.mipLevels = 1u;
550-
inCPUImage = asset::ICPUImage::create(std::move(inCPUImageParams));
551-
assert(inCPUImage);
552-
core::smart_refctd_ptr<asset::ICPUBuffer> inCPUBuffer = core::make_smart_refctd_ptr< asset::CCustomAllocatorCPUBuffer<core::null_allocator<uint8_t>, true> >(srcBuffer->getSize(), inCpuBufferPointer, core::adopt_memory);
553-
inCPUImage->setBufferAndRegions(std::move(inCPUBuffer), inCpuImageRegionsDynArray);
554-
assert(inCPUImage->getBuffer());
555-
assert(inCPUImage->getRegions().size() > 0u);
556-
}
570+
// this one is not cached currently
571+
// because creating the image requires a buffer pointing at the stagingBuffer memory, which we do not have access to at initialization time
572+
// [TODO] but maybe we could cache it by tricking the filters into applying the `stagingBufferOffset` through outOffsetBaseLayer
573+
// and we know we can because `stagingBufferOffset` is a multiple of block byte size, but range checks may fail?!
557574

558575
// outCPUImage is an image matching the params of dstImage but with the extents and layer count of the current region being copied and mipLevel 1u
559576
// the buffer of this image is set to (stagingBufferPointer + stagingBufferOffset) and the related region is set to cover the whole copy region (offset from 0)
@@ -612,7 +629,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
612629
core::smart_refctd_ptr<asset::ICPUImage> outCPUImage;
613630
createMockInOutCPUImagesForFilter(inCPUImage, outCPUImage, layersToUploadMemorySize);
614631

615-
bool copySuccess = performCopy(srcImageFormat, dstImageFormat, inCPUImage, outCPUImage, regionToCopyNext);
632+
const auto inOffsetBaseLayer = core::vector4du32_SIMD(currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
633+
const auto outOffsetBaseLayer = core::vector4du32_SIMD(currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
634+
bool copySuccess = performIntermediateCopy(srcImageFormat, dstImageFormat, inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, regionToCopyNext);
616635

617636
if(copySuccess)
618637
{
@@ -650,7 +669,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
650669
core::smart_refctd_ptr<asset::ICPUImage> outCPUImage;
651670
createMockInOutCPUImagesForFilter(inCPUImage, outCPUImage, slicesToUploadMemorySize);
652671

653-
bool copySuccess = performCopy(srcImageFormat, dstImageFormat, inCPUImage, outCPUImage, regionToCopyNext);
672+
const auto inOffsetBaseLayer = core::vector4du32_SIMD(currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
673+
const auto outOffsetBaseLayer = core::vector4du32_SIMD(currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
674+
bool copySuccess = performIntermediateCopy(srcImageFormat, dstImageFormat, inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, regionToCopyNext);
654675

655676
if(copySuccess)
656677
{
@@ -688,7 +709,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
688709
core::smart_refctd_ptr<asset::ICPUImage> outCPUImage;
689710
createMockInOutCPUImagesForFilter(inCPUImage, outCPUImage, rowsToUploadMemorySize);
690711

691-
bool copySuccess = performCopy(srcImageFormat, dstImageFormat, inCPUImage, outCPUImage, regionToCopyNext);
712+
const auto inOffsetBaseLayer = core::vector4du32_SIMD(currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
713+
const auto outOffsetBaseLayer = core::vector4du32_SIMD(currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
714+
bool copySuccess = performIntermediateCopy(srcImageFormat, dstImageFormat, inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, regionToCopyNext);
692715

693716
if(copySuccess)
694717
{
@@ -727,7 +750,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
727750
core::smart_refctd_ptr<asset::ICPUImage> outCPUImage;
728751
createMockInOutCPUImagesForFilter(inCPUImage, outCPUImage, blocksToUploadMemorySize);
729752

730-
bool copySuccess = performCopy(srcImageFormat, dstImageFormat, inCPUImage, outCPUImage, regionToCopyNext);
753+
const auto inOffsetBaseLayer = core::vector4du32_SIMD(currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
754+
const auto outOffsetBaseLayer = core::vector4du32_SIMD(currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
755+
bool copySuccess = performIntermediateCopy(srcImageFormat, dstImageFormat, inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, regionToCopyNext);
731756

732757
if(copySuccess)
733758
{

0 commit comments

Comments
 (0)