Skip to content

Commit 89b4bd0

Browse files
committed
optimalCopyRowPitchAlignment considered
1 parent bb412f8 commit 89b4bd0

File tree

2 files changed

+51
-47
lines changed

2 files changed

+51
-47
lines changed

include/nbl/video/utilities/IUtilities.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1033,15 +1033,24 @@ class NBL_API ImageRegionIterator
10331033
uint32_t getCurrentLayerInRegion() const { return currentLayerInRegion; }
10341034
uint32_t getCurrentRegion() const { return currentRegion; }
10351035

1036+
// Builds the per-axis texel strides to use when copying a region of size `copyExtents`.
// x: copyExtents.width passed through core::alignUp with optimalRowPitchAlignment
//    (presumably rounds the width up to a multiple of the alignment - confirm against core::alignUp).
// y, z: copyExtents.height and copyExtents.depth, forwarded unchanged.
// NOTE(review): the alignment is applied to a width expressed in texels; if optimalRowPitchAlignment
// is a byte quantity, verify the resulting byte row pitch is what the copy actually requires - TODO confirm.
inline core::vector3du32_SIMD getOptimalCopyTexelStrides(const asset::VkExtent3D& copyExtents) const
1037+
{
1038+
return core::vector3du32_SIMD(
1039+
core::alignUp(copyExtents.width, optimalRowPitchAlignment),
1040+
copyExtents.height,
1041+
copyExtents.depth);
1042+
}
1043+
10361044
private:
1045+
10371046
core::SRange<const asset::IImage::SBufferCopy> regions;
10381047

10391048
// Mock CPU Images used to copy cpu buffer to staging buffer
10401049
std::vector<core::smart_refctd_ptr<asset::ICPUImage>> imageFilterInCPUImages;
10411050
core::smart_refctd_dynamic_array<asset::ICPUImage::SBufferCopy> outCPUImageRegions; // needs to be updated before each upload
10421051
std::vector<core::smart_refctd_ptr<asset::ICPUImage>> imageFilterOutCPUImages;
10431052

1044-
uint32_t optimalRowPitchAlignment = 1u;
1053+
size_t optimalRowPitchAlignment = 1u;
10451054
bool canTransferMipLevelsPartially = false;
10461055
asset::VkExtent3D minImageTransferGranularity = {};
10471056
uint32_t bufferOffsetAlignment = 1u;

src/nbl/video/utilities/IUtilities.cpp

Lines changed: 41 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ IGPUQueue::SSubmitInfo IUtilities::updateImageViaStagingBuffer(
6767
return intendedNextSubmit;
6868
}
6969

70-
ImageRegionIterator regionIterator = ImageRegionIterator(regions, queueFamProps, srcBuffer, srcFormat, dstImage, 64u/*limits.optimalBufferCopyRowPitchAlignment*/);
70+
ImageRegionIterator regionIterator = ImageRegionIterator(regions, queueFamProps, srcBuffer, srcFormat, dstImage, limits.optimalBufferCopyRowPitchAlignment);
7171

7272
// Assuming each thread can handle minImageTransferGranularitySize of texelBlocks:
7373
const uint32_t maxResidentImageTransferSize = limits.maxResidentInvocations * texelBlockInfo.getBlockByteSize() * (minImageTransferGranularity.width * minImageTransferGranularity.height * minImageTransferGranularity.depth);
@@ -86,10 +86,9 @@ IGPUQueue::SSubmitInfo IUtilities::updateImageViaStagingBuffer(
8686
uint32_t memoryLowerBound = maxResidentImageTransferSize;
8787
{
8888
const asset::IImage::SBufferCopy & region = regions[regionIterator.getCurrentRegion()];
89-
auto imageExtent = core::vector3du32_SIMD(region.imageExtent.width, region.imageExtent.height, region.imageExtent.depth);
90-
auto imageExtentInBlocks = texelBlockInfo.convertTexelsToBlocks(imageExtent);
91-
auto imageExtentBlockStridesInBytes = texelBlockInfo.convert3DBlockStridesTo1DByteStrides(imageExtentInBlocks);
92-
memoryLowerBound = core::max(memoryLowerBound, imageExtentBlockStridesInBytes[1]); // rowPitch = imageExtentBlockStridesInBytes[1]
89+
const auto copyTexelStrides = regionIterator.getOptimalCopyTexelStrides(region.imageExtent);
90+
const auto byteStrides = texelBlockInfo.convert3DTexelStridesTo1DByteStrides(copyTexelStrides);
91+
memoryLowerBound = core::max(memoryLowerBound, byteStrides[1]); // max of memoryLowerBound and copy rowPitch
9392
}
9493

9594
uint32_t localOffset = video::StreamingTransientDataBufferMT<>::invalid_value;
@@ -218,6 +217,7 @@ ImageRegionIterator::ImageRegionIterator(
218217
, currentSliceInLayer(0u)
219218
, currentLayerInRegion(0u)
220219
, currentRegion(0u)
220+
, optimalRowPitchAlignment(optimalRowPitchAlignment)
221221
{
222222
dstImageFormat = dstImage->getCreationParameters().format;
223223
if(srcImageFormat == asset::EF_UNKNOWN)
@@ -328,15 +328,12 @@ size_t ImageRegionIterator::getMemoryNeededForRemainingRegions() const
328328
{
329329
const asset::IImage::SBufferCopy & region = regions[i];
330330

331-
// auto optimalRegion = region;
332-
// optimalRegion.bufferRowLength = core::alignUp(optimalRegion.bufferRowLength, optimalRowPitchAlignment);
333-
auto imageExtent = core::vector3du32_SIMD(region.imageExtent.width, region.imageExtent.height, region.imageExtent.depth);
334-
auto imageExtentInBlocks = dstImageTexelBlockInfo.convertTexelsToBlocks(imageExtent);
335-
336-
// TODO: This needs to change with optimal rowpitch
337-
auto imageExtentBlockStridesInBytes = dstImageTexelBlockInfo.convert3DBlockStridesTo1DByteStrides(imageExtentInBlocks);
331+
auto imageExtentInBlocks = dstImageTexelBlockInfo.convertTexelsToBlocks(core::vector3du32_SIMD(region.imageExtent.width, region.imageExtent.height, region.imageExtent.depth));
332+
333+
const auto copyTexelStrides = getOptimalCopyTexelStrides(region.imageExtent);
334+
const core::vector4du32_SIMD copyByteStrides = dstImageTexelBlockInfo.convert3DTexelStridesTo1DByteStrides(copyTexelStrides);
338335

339-
if(i == currentRegion)
336+
if (i == currentRegion)
340337
{
341338
auto remainingBlocksInRow = imageExtentInBlocks.x - currentBlockInRow;
342339
auto remainingRowsInSlice = imageExtentInBlocks.y - currentRowInSlice;
@@ -345,42 +342,42 @@ size_t ImageRegionIterator::getMemoryNeededForRemainingRegions() const
345342

346343
if (currentBlockInRow == 0 && currentRowInSlice == 0 && currentSliceInLayer == 0 && remainingLayersInRegion > 0)
347344
{
348-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[3] * remainingLayersInRegion);
345+
incrementMemoryNeeded(copyByteStrides[3] * remainingLayersInRegion);
349346
}
350347
else if (currentBlockInRow == 0 && currentRowInSlice == 0 && currentSliceInLayer > 0)
351348
{
352-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[2] * remainingSlicesInLayer);
349+
incrementMemoryNeeded(copyByteStrides[2] * remainingSlicesInLayer);
353350
if (remainingLayersInRegion > 1u)
354-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[3] * (remainingLayersInRegion - 1u));
351+
incrementMemoryNeeded(copyByteStrides[3] * (remainingLayersInRegion - 1u));
355352
}
356353
else if (currentBlockInRow == 0 && currentRowInSlice > 0)
357354
{
358-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[1] * remainingRowsInSlice);
355+
incrementMemoryNeeded(copyByteStrides[1] * remainingRowsInSlice);
359356

360-
if(remainingSlicesInLayer > 1u)
361-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[2] * (remainingSlicesInLayer - 1u));
362-
if(remainingLayersInRegion > 1u)
363-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[3] * (remainingLayersInRegion - 1u));
357+
if (remainingSlicesInLayer > 1u)
358+
incrementMemoryNeeded(copyByteStrides[2] * (remainingSlicesInLayer - 1u));
359+
if (remainingLayersInRegion > 1u)
360+
incrementMemoryNeeded(copyByteStrides[3] * (remainingLayersInRegion - 1u));
364361
}
365362
else if (currentBlockInRow > 0)
366363
{
367364
// want to first fill the remaining blocks in current row
368-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[0] * remainingBlocksInRow);
365+
incrementMemoryNeeded(copyByteStrides[0] * remainingBlocksInRow);
369366
// then fill the remaining rows in current slice
370-
if(remainingRowsInSlice > 1u)
371-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[1] * (remainingRowsInSlice - 1u));
367+
if (remainingRowsInSlice > 1u)
368+
incrementMemoryNeeded(copyByteStrides[1] * (remainingRowsInSlice - 1u));
372369
// then fill the remaining slices in current layer
373-
if(remainingSlicesInLayer > 1u)
374-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[2] * (remainingSlicesInLayer - 1u));
370+
if (remainingSlicesInLayer > 1u)
371+
incrementMemoryNeeded(copyByteStrides[2] * (remainingSlicesInLayer - 1u));
375372
// then fill the remaining layers in current region
376-
if(remainingLayersInRegion > 1u)
377-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[3] * (remainingLayersInRegion - 1u));
373+
if (remainingLayersInRegion > 1u)
374+
incrementMemoryNeeded(copyByteStrides[3] * (remainingLayersInRegion - 1u));
378375
}
379376
}
380377
else
381378
{
382379
// we want to fill the whole layers in the region
383-
incrementMemoryNeeded(imageExtentBlockStridesInBytes[3] * region.imageSubresource.layerCount); // = blockByteSize * imageExtentInBlocks.x * imageExtentInBlocks.y * imageExtentInBlocks.z * region.imageSubresource.layerCount
380+
incrementMemoryNeeded(copyByteStrides[3] * region.imageSubresource.layerCount); // = blockByteSize * imageExtentInBlocks.x * imageExtentInBlocks.y * imageExtentInBlocks.z * region.imageSubresource.layerCount
384381
}
385382
}
386383
return memoryNeededForRemainingRegions;
@@ -495,11 +492,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
495492
}
496493

497494
const asset::TexelBlockInfo dstImageTexelBlockInfo(dstImageFormat);
498-
const asset::TexelBlockInfo srcImageTexelBlockInfo(srcImageFormat);
499495

500496
// ! Current Region that may break down into smaller regions (the first smaller region is nextRegionToCopy)
501497
const asset::IImage::SBufferCopy & mainRegion = regions[currentRegion];
502-
const core::vector4du32_SIMD srcBufferByteStrides = mainRegion.getByteStrides(srcImageTexelBlockInfo);
503498

504499
// ! We only need subresourceSize for validations and assertions about minImageTransferGranularity because granularity requirements can be ignored if region fits against the right corner of the subresource (described in more detail below)
505500
const auto subresourceSize = dstImage->getMipSize(mainRegion.imageSubresource.mipLevel);
@@ -511,9 +506,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
511506
const auto imageOffsetInBlocks = dstImageTexelBlockInfo.convertTexelsToBlocks(core::vector3du32_SIMD(mainRegion.imageOffset.x, mainRegion.imageOffset.y, mainRegion.imageOffset.z));
512507
const auto imageExtentInBlocks = dstImageTexelBlockInfo.convertTexelsToBlocks(core::vector3du32_SIMD(mainRegion.imageExtent.width, mainRegion.imageExtent.height, mainRegion.imageExtent.depth));
513508

514-
// TODO: This needs to change with optimal rowpitch
515-
const core::vector4du32_SIMD imageExtentBlockStridesInBytes = dstImageTexelBlockInfo.convert3DBlockStridesTo1DByteStrides(imageExtentInBlocks);
516-
509+
const auto copyTexelStrides = getOptimalCopyTexelStrides(mainRegion.imageExtent);
510+
const core::vector4du32_SIMD copyByteStrides = dstImageTexelBlockInfo.convert3DTexelStridesTo1DByteStrides(copyTexelStrides);
511+
517512
// region <-> region.imageSubresource.layerCount <-> imageExtentInBlocks.z <-> imageExtentInBlocks.y <-> imageExtentInBlocks.x
518513
auto updateCurrentOffsets = [&]() -> void
519514
{
@@ -542,10 +537,10 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
542537
}
543538
};
544539

545-
uint32_t eachBlockNeededMemory = imageExtentBlockStridesInBytes[0]; // = blockByteSize
546-
uint32_t eachRowNeededMemory = imageExtentBlockStridesInBytes[1]; // = blockByteSize * imageExtentInBlocks.x
547-
uint32_t eachSliceNeededMemory = imageExtentBlockStridesInBytes[2]; // = blockByteSize * imageExtentInBlocks.x * imageExtentInBlocks.y
548-
uint32_t eachLayerNeededMemory = imageExtentBlockStridesInBytes[3]; // = blockByteSize * imageExtentInBlocks.x * imageExtentInBlocks.y * imageExtentInBlocks.z
540+
uint32_t eachBlockNeededMemory = copyByteStrides[0]; // = blockByteSize
541+
uint32_t eachRowNeededMemory = copyByteStrides[1]; // = blockByteSize * copyBlockStrides.x
542+
uint32_t eachSliceNeededMemory = copyByteStrides[2]; // = blockByteSize * copyBlockStrides.x * copyBlockStrides.y
543+
uint32_t eachLayerNeededMemory = copyByteStrides[3]; // = blockByteSize * copyBlockStrides.x * copyBlockStrides.y * copyBlockStrides.z
549544

550545
// There are remaining layers in the region that need copying
551546
uint32_t uploadableArrayLayers = availableMemory / eachLayerNeededMemory;
@@ -606,8 +601,8 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
606601
uint32_t layersToUploadMemorySize = eachLayerNeededMemory * uploadableArrayLayers;
607602

608603
regionToCopyNext.bufferOffset = stagingBufferOffset;
609-
regionToCopyNext.bufferRowLength = imageExtentInBlocks.x * texelBlockDim.x;
610-
regionToCopyNext.bufferImageHeight = imageExtentInBlocks.y * texelBlockDim.y;
604+
regionToCopyNext.bufferRowLength = copyTexelStrides.x;
605+
regionToCopyNext.bufferImageHeight = copyTexelStrides.y;
611606
regionToCopyNext.imageSubresource.aspectMask = mainRegion.imageSubresource.aspectMask;
612607
regionToCopyNext.imageSubresource.mipLevel = mainRegion.imageSubresource.mipLevel;
613608
regionToCopyNext.imageSubresource.baseArrayLayer = mainRegion.imageSubresource.baseArrayLayer + currentLayerInRegion;
@@ -645,8 +640,8 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
645640
uint32_t slicesToUploadMemorySize = eachSliceNeededMemory * uploadableSlices;
646641

647642
regionToCopyNext.bufferOffset = stagingBufferOffset;
648-
regionToCopyNext.bufferRowLength = imageExtentInBlocks.x * texelBlockDim.x;
649-
regionToCopyNext.bufferImageHeight = imageExtentInBlocks.y * texelBlockDim.y;
643+
regionToCopyNext.bufferRowLength = copyTexelStrides.x;
644+
regionToCopyNext.bufferImageHeight = copyTexelStrides.y;
650645
regionToCopyNext.imageSubresource.aspectMask = mainRegion.imageSubresource.aspectMask;
651646
regionToCopyNext.imageSubresource.mipLevel = mainRegion.imageSubresource.mipLevel;
652647
regionToCopyNext.imageSubresource.baseArrayLayer = mainRegion.imageSubresource.baseArrayLayer + currentLayerInRegion;
@@ -684,8 +679,8 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
684679
uint32_t rowsToUploadMemorySize = eachRowNeededMemory * uploadableRows;
685680

686681
regionToCopyNext.bufferOffset = stagingBufferOffset;
687-
regionToCopyNext.bufferRowLength = imageExtentInBlocks.x * texelBlockDim.x;
688-
regionToCopyNext.bufferImageHeight = imageExtentInBlocks.y * texelBlockDim.y;
682+
regionToCopyNext.bufferRowLength = copyTexelStrides.x;
683+
regionToCopyNext.bufferImageHeight = copyTexelStrides.y;
689684
regionToCopyNext.imageSubresource.aspectMask = mainRegion.imageSubresource.aspectMask;
690685
regionToCopyNext.imageSubresource.mipLevel = mainRegion.imageSubresource.mipLevel;
691686
regionToCopyNext.imageSubresource.baseArrayLayer = mainRegion.imageSubresource.baseArrayLayer + currentLayerInRegion;
@@ -724,8 +719,8 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
724719
uint32_t blocksToUploadMemorySize = eachBlockNeededMemory * uploadableBlocks;
725720

726721
regionToCopyNext.bufferOffset = stagingBufferOffset;
727-
regionToCopyNext.bufferRowLength = imageExtentInBlocks.x * texelBlockDim.x;
728-
regionToCopyNext.bufferImageHeight = imageExtentInBlocks.y * texelBlockDim.y;
722+
regionToCopyNext.bufferRowLength = copyTexelStrides.x;
723+
regionToCopyNext.bufferImageHeight = copyTexelStrides.y;
729724
regionToCopyNext.imageSubresource.aspectMask = mainRegion.imageSubresource.aspectMask;
730725
regionToCopyNext.imageSubresource.mipLevel = mainRegion.imageSubresource.mipLevel;
731726
regionToCopyNext.imageSubresource.baseArrayLayer = mainRegion.imageSubresource.baseArrayLayer + currentLayerInRegion;

0 commit comments

Comments
 (0)