@@ -229,19 +229,69 @@ ImageRegionIterator::ImageRegionIterator(
229
229
// [x] If Queue doesn't support GRAPHICS_BIT or COMPUTE_BIT -> must be multiple of 4
230
230
// [x] bufferOffset must be a multiple of texel block size in bytes
231
231
bufferOffsetAlignment = dstImageTexelBlockInfo.getBlockByteSize (); // can be non power of two
232
- if (asset::isDepthOrStencilFormat (dstImageFormat))
232
+ if (asset::isDepthOrStencilFormat (dstImageFormat))
233
233
bufferOffsetAlignment = std::lcm (bufferOffsetAlignment, 4u );
234
234
235
235
bool queueSupportsCompute = queueFamilyProps.queueFlags .hasFlags (IPhysicalDevice::EQF_COMPUTE_BIT);
236
236
bool queueSupportsGraphics = queueFamilyProps.queueFlags .hasFlags (IPhysicalDevice::EQF_GRAPHICS_BIT);
237
- if ((queueSupportsGraphics || queueSupportsCompute) == false )
237
+ if ((queueSupportsGraphics || queueSupportsCompute) == false )
238
238
bufferOffsetAlignment = std::lcm (bufferOffsetAlignment, 4u );
239
239
// TODO: Need to have a function to get equivalent format of the specific plane of this format (in aspectMask)
240
240
// if(asset::isPlanarFormat(dstImageFormat->getCreationParameters().format))
241
241
242
242
// Queues supporting graphics and/or compute operations must report (1,1,1) in minImageTransferGranularity, meaning that there are no additional restrictions on the granularity of image transfer operations for these queues.
243
243
// Other queues supporting image transfer operations are only required to support whole mip level transfers, thus minImageTransferGranularity for queues belonging to such queue families may be (0,0,0)
244
244
canTransferMipLevelsPartially = !(minImageTransferGranularity.width == 0 && minImageTransferGranularity.height == 0 && minImageTransferGranularity.depth == 0 );
245
+
246
+ auto dstImageParams = dstImage->getCreationParameters ();
247
+
248
+ /*
249
+ We have to first construct two `ICPUImage`s per Region named `inCPUImage` and `outCPUImage`
250
+ Then we will create fake ICPUBuffers that point to srcBuffer and stagingBuffer with correct offsets
251
+ Then we have to set the buffer and regions for each one of those ICPUImages using setBufferAndRegions
252
+ Finally we fill the filter state and `execute` which require in/out CPUImages
253
+ */
254
+
255
+ imageFilterInCPUImages.resize (regions.size ());
256
+ // imageFilterOutCPUImages.resize(regions.size());
257
+ for (uint32_t i = 0 ; i < copyRegions.size (); ++i)
258
+ {
259
+ auto & inCPUImage = imageFilterInCPUImages[i];
260
+ const auto region = regions[i];
261
+ // inCPUImage is an image matching the params of dstImage but with the extents and layer count of the current region being copied and mipLevel 1u and the format being srcImageFormat
262
+ // the buffer of this image is set to (srcBuffer+Offset) and the related region is set to cover the whole copy region (offset from 0)
263
+ auto inCpuImageRegionsDynArray = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<asset::ICPUImage::SBufferCopy>>(1 );
264
+ auto & inCpuImageRegion = inCpuImageRegionsDynArray->front ();
265
+ inCpuImageRegion = {};
266
+ inCpuImageRegion.bufferOffset = 0u ;
267
+ inCpuImageRegion.bufferRowLength = region.bufferRowLength ;
268
+ inCpuImageRegion.bufferImageHeight = region.bufferImageHeight ;
269
+ inCpuImageRegion.imageSubresource .aspectMask = region.imageSubresource .aspectMask ;
270
+ inCpuImageRegion.imageSubresource .mipLevel = 0u ;
271
+ inCpuImageRegion.imageSubresource .baseArrayLayer = 0u ;
272
+ inCpuImageRegion.imageOffset .x = 0u ;
273
+ inCpuImageRegion.imageOffset .y = 0u ;
274
+ inCpuImageRegion.imageOffset .z = 0u ;
275
+ inCpuImageRegion.imageExtent .width = region.imageExtent .width ;
276
+ inCpuImageRegion.imageExtent .height = region.imageExtent .height ;
277
+ inCpuImageRegion.imageExtent .depth = region.imageExtent .depth ;
278
+ inCpuImageRegion.imageSubresource .layerCount = region.imageSubresource .layerCount ;
279
+
280
+ uint64_t offsetInCPUBuffer = region.bufferOffset ;
281
+ uint8_t * inCpuBufferPointer = const_cast <uint8_t *>(reinterpret_cast <const uint8_t *>(srcBuffer->getPointer ()) + offsetInCPUBuffer);
282
+ asset::ICPUImage::SCreationParams inCPUImageParams = dstImageParams;
283
+ inCPUImageParams.flags = asset::IImage::ECF_NONE; // Because we may want to write to first few layers of CUBEMAP (<6) but it's not valid to create an Cube ICPUImage with less that 6 layers.
284
+ inCPUImageParams.format = srcImageFormat;
285
+ inCPUImageParams.extent = region.imageExtent ;
286
+ inCPUImageParams.arrayLayers = region.imageSubresource .layerCount ;
287
+ inCPUImageParams.mipLevels = 1u ; // since we copy one mip at a time to our dst image, it doesn't matter at the stage when we copy from cpu memory to staging memory
288
+ inCPUImage = asset::ICPUImage::create (std::move (inCPUImageParams));
289
+ assert (inCPUImage);
290
+ core::smart_refctd_ptr<asset::ICPUBuffer> inCPUBuffer = core::make_smart_refctd_ptr< asset::CCustomAllocatorCPUBuffer<core::null_allocator<uint8_t >, true > >(srcBuffer->getSize (), inCpuBufferPointer, core::adopt_memory);
291
+ inCPUImage->setBufferAndRegions (std::move (inCPUBuffer), inCpuImageRegionsDynArray);
292
+ assert (inCPUImage->getBuffer ());
293
+ assert (inCPUImage->getRegions ().size () > 0u );
294
+ }
245
295
}
246
296
247
297
size_t ImageRegionIterator::getMemoryNeededForRemainingRegions () const
@@ -347,6 +397,8 @@ struct PromotionComponentSwizzle
347
397
348
398
template <typename Filter>
349
399
bool performCopyUsingImageFilter (
400
+ const core::vector4du32_SIMD& inOffsetBaseLayer,
401
+ const core::vector4du32_SIMD& ouOffsetBaseLayer,
350
402
const core::smart_refctd_ptr<asset::ICPUImage>& inCPUImage,
351
403
const core::smart_refctd_ptr<asset::ICPUImage>& outCPUImage,
352
404
const asset::IImage::SBufferCopy& region)
@@ -357,7 +409,7 @@ bool performCopyUsingImageFilter(
357
409
state.layerCount = region.imageSubresource .layerCount ;
358
410
state.inImage = inCPUImage.get ();
359
411
state.outImage = outCPUImage.get ();
360
- state.inOffsetBaseLayer = core::vectorSIMDu32 ( 0u ) ;
412
+ state.inOffsetBaseLayer = inOffsetBaseLayer ;
361
413
state.outOffsetBaseLayer = core::vectorSIMDu32 (0u );
362
414
state.inMipLevel = 0u ;
363
415
state.outMipLevel = 0u ;
@@ -368,29 +420,31 @@ bool performCopyUsingImageFilter(
368
420
return false ;
369
421
}
370
422
371
- bool performCopy (
423
+ bool performIntermediateCopy (
372
424
asset::E_FORMAT srcImageFormat,
373
425
asset::E_FORMAT dstImageFormat,
426
+ const core::vector4du32_SIMD& inOffsetBaseLayer,
427
+ const core::vector4du32_SIMD& outOffsetBaseLayer,
374
428
const core::smart_refctd_ptr<asset::ICPUImage>& inCPUImage,
375
429
const core::smart_refctd_ptr<asset::ICPUImage>& outCPUImage,
376
430
const asset::IImage::SBufferCopy& region)
377
431
{
378
432
// In = srcBuffer, Out = stagingBuffer
379
433
if (srcImageFormat == dstImageFormat)
380
434
{
381
- return performCopyUsingImageFilter<asset::CCopyImageFilter>(inCPUImage, outCPUImage, region);
435
+ return performCopyUsingImageFilter<asset::CCopyImageFilter>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
382
436
}
383
437
else
384
438
{
385
439
auto srcChannelCount = asset::getFormatChannelCount (srcImageFormat);
386
440
if (srcChannelCount == 1u )
387
- performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<1u >>>(inCPUImage, outCPUImage, region);
441
+ performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<1u >>>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
388
442
else if (srcChannelCount == 2u )
389
- performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<2u >>>(inCPUImage, outCPUImage, region);
443
+ performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<2u >>>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
390
444
else if (srcChannelCount == 3u )
391
- performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<3u >>>(inCPUImage, outCPUImage, region);
445
+ performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<3u >>>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
392
446
else
393
- performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<4u >>>(inCPUImage, outCPUImage, region);
447
+ performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<4u >>>(inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, region);
394
448
}
395
449
}
396
450
@@ -508,52 +562,15 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
508
562
509
563
// ! Function to create mock cpu images that can go into image filters for copying/converting
510
564
auto createMockInOutCPUImagesForFilter = [&](core::smart_refctd_ptr<asset::ICPUImage>& inCPUImage, core::smart_refctd_ptr<asset::ICPUImage>& outCPUImage, const size_t outCPUBufferSize) -> void
511
- {
512
- /*
513
- We have to first construct two `ICPUImage`s from each of those buffers `inCPUImage` and `outCPUImage`
514
- Then we will create fake ICPUBuffers that point to srcBuffer and stagingBuffer with correct offsets
515
- Then we have to set the buffer and regions for each one of those ICPUImages using setBufferAndRegions
516
- Finally we fill the filter state and `execute` which require in/out CPUImages
517
- */
518
-
565
+ {
566
+ // this one is cached because we can
567
+ inCPUImage = imageFilterInCPUImages[currentRegion];
519
568
auto dstImageParams = dstImage->getCreationParameters ();
520
569
521
- // inCPUImage is an image matching the params of dstImage but with the extents and layer count of the current region being copied and mipLevel 1u and the format being srcImageFormat
522
- // the buffer of this image is set to (srcBuffer+Offset) and the related region is set to cover the whole copy region (offset from 0)
523
- {
524
- auto inCpuImageRegionsDynArray = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<asset::ICPUImage::SBufferCopy>>(1 );
525
- auto & inCpuImageRegion = inCpuImageRegionsDynArray->front ();
526
- inCpuImageRegion = {};
527
- inCpuImageRegion.bufferOffset = 0u ;
528
- inCpuImageRegion.bufferRowLength = mainRegion.bufferRowLength ;
529
- inCpuImageRegion.bufferImageHeight = mainRegion.bufferImageHeight ;
530
- inCpuImageRegion.imageSubresource .aspectMask = mainRegion.imageSubresource .aspectMask ;
531
- inCpuImageRegion.imageSubresource .mipLevel = 0u ;
532
- inCpuImageRegion.imageSubresource .baseArrayLayer = 0u ;
533
- inCpuImageRegion.imageOffset .x = 0u ;
534
- inCpuImageRegion.imageOffset .y = 0u ;
535
- inCpuImageRegion.imageOffset .z = 0u ;
536
- inCpuImageRegion.imageExtent .width = regionToCopyNext.imageExtent .width ;
537
- inCpuImageRegion.imageExtent .height = regionToCopyNext.imageExtent .height ;
538
- inCpuImageRegion.imageExtent .depth = regionToCopyNext.imageExtent .depth ;
539
- inCpuImageRegion.imageSubresource .layerCount = core::max (regionToCopyNext.imageSubresource .layerCount , 1u );
540
-
541
- auto localImageOffset = core::vector4du32_SIMD (currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
542
- uint64_t offsetInCPUBuffer = mainRegion.bufferOffset + core::dot (localImageOffset, srcBufferByteStrides)[0 ];
543
- uint8_t * inCpuBufferPointer = const_cast <uint8_t *>(reinterpret_cast <const uint8_t *>(srcBuffer->getPointer ()) + offsetInCPUBuffer);
544
- asset::ICPUImage::SCreationParams inCPUImageParams = dstImageParams;
545
- inCPUImageParams.flags = asset::IImage::ECF_NONE; // Because we may want to write to first few layers of CUBEMAP (<6) but it's not valid to create an Cube ICPUImage with less that 6 layers.
546
- inCPUImageParams.format = srcImageFormat;
547
- inCPUImageParams.extent = regionToCopyNext.imageExtent ;
548
- inCPUImageParams.arrayLayers = regionToCopyNext.imageSubresource .layerCount ;
549
- inCPUImageParams.mipLevels = 1u ;
550
- inCPUImage = asset::ICPUImage::create (std::move (inCPUImageParams));
551
- assert (inCPUImage);
552
- core::smart_refctd_ptr<asset::ICPUBuffer> inCPUBuffer = core::make_smart_refctd_ptr< asset::CCustomAllocatorCPUBuffer<core::null_allocator<uint8_t >, true > >(srcBuffer->getSize (), inCpuBufferPointer, core::adopt_memory);
553
- inCPUImage->setBufferAndRegions (std::move (inCPUBuffer), inCpuImageRegionsDynArray);
554
- assert (inCPUImage->getBuffer ());
555
- assert (inCPUImage->getRegions ().size () > 0u );
556
- }
570
+ // this one is not cached currently
571
+ // because image creation depends on creating it with a buffer pointing to stagingBuffer memory pointer which we do not have access to in initialization time
572
+ // [TODO] but maybe we could cache it by tricking the filters to have the `stagingBufferOffset` with outOffsetBaseLayer
573
+ // and we know we can because `stagingBufferOffset` is a multiple of block byte size, but range checks may fail?!
557
574
558
575
// outCPUImage is an image matching the params of dstImage but with the extents and layer count of the current region being copied and mipLevel 1u
559
576
// the buffer of this image is set to (stagingBufferPointer + stagingBufferOffset) and the related region is set to cover the whole copy region (offset from 0)
@@ -612,7 +629,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
612
629
core::smart_refctd_ptr<asset::ICPUImage> outCPUImage;
613
630
createMockInOutCPUImagesForFilter (inCPUImage, outCPUImage, layersToUploadMemorySize);
614
631
615
- bool copySuccess = performCopy (srcImageFormat, dstImageFormat, inCPUImage, outCPUImage, regionToCopyNext);
632
+ const auto inOffsetBaseLayer = core::vector4du32_SIMD (currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
633
+ const auto outOffsetBaseLayer = core::vector4du32_SIMD (currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
634
+ bool copySuccess = performIntermediateCopy (srcImageFormat, dstImageFormat, inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, regionToCopyNext);
616
635
617
636
if (copySuccess)
618
637
{
@@ -650,7 +669,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
650
669
core::smart_refctd_ptr<asset::ICPUImage> outCPUImage;
651
670
createMockInOutCPUImagesForFilter (inCPUImage, outCPUImage, slicesToUploadMemorySize);
652
671
653
- bool copySuccess = performCopy (srcImageFormat, dstImageFormat, inCPUImage, outCPUImage, regionToCopyNext);
672
+ const auto inOffsetBaseLayer = core::vector4du32_SIMD (currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
673
+ const auto outOffsetBaseLayer = core::vector4du32_SIMD (currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
674
+ bool copySuccess = performIntermediateCopy (srcImageFormat, dstImageFormat, inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, regionToCopyNext);
654
675
655
676
if (copySuccess)
656
677
{
@@ -688,7 +709,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
688
709
core::smart_refctd_ptr<asset::ICPUImage> outCPUImage;
689
710
createMockInOutCPUImagesForFilter (inCPUImage, outCPUImage, rowsToUploadMemorySize);
690
711
691
- bool copySuccess = performCopy (srcImageFormat, dstImageFormat, inCPUImage, outCPUImage, regionToCopyNext);
712
+ const auto inOffsetBaseLayer = core::vector4du32_SIMD (currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
713
+ const auto outOffsetBaseLayer = core::vector4du32_SIMD (currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
714
+ bool copySuccess = performIntermediateCopy (srcImageFormat, dstImageFormat, inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, regionToCopyNext);
692
715
693
716
if (copySuccess)
694
717
{
@@ -727,7 +750,9 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo
727
750
core::smart_refctd_ptr<asset::ICPUImage> outCPUImage;
728
751
createMockInOutCPUImagesForFilter (inCPUImage, outCPUImage, blocksToUploadMemorySize);
729
752
730
- bool copySuccess = performCopy (srcImageFormat, dstImageFormat, inCPUImage, outCPUImage, regionToCopyNext);
753
+ const auto inOffsetBaseLayer = core::vector4du32_SIMD (currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
754
+ const auto outOffsetBaseLayer = core::vector4du32_SIMD (currentBlockInRow, currentRowInSlice, currentSliceInLayer, currentLayerInRegion);
755
+ bool copySuccess = performIntermediateCopy (srcImageFormat, dstImageFormat, inOffsetBaseLayer, outOffsetBaseLayer, inCPUImage, outCPUImage, regionToCopyNext);
731
756
732
757
if (copySuccess)
733
758
{
0 commit comments