 namespace vkcompute {
 namespace api {
 
-/*
- * Given the strides of a buffer-backed tensor, estimate the equivalent memory
- * layout enum value by identifying the fastest moving dimension.
- */
-utils::GPUMemoryLayout estimate_memory_layout(
-    const std::vector<int64_t>& dim_order) {
-  int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back();
-  if (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3) {
-    return utils::GPUMemoryLayout(fastest_dim_whcn);
-  }
-
-  // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding
-  // a UNKOWN GPUMemoryLayout. This is not high priority though because we don't
-  // expect this to ever come up in practice.
-  VK_THROW("No compatible GPUMemoryLayout value");
-}
-
 std::vector<int64_t> calculate_dim_order(
     const size_t ndim,
-    const utils::GPUMemoryLayout memory_layout) {
+    const int32_t packed_dim) {
   // Special case for zero dim tensors
   if (ndim == 0) {
     return {0};
   }
   std::vector<int64_t> dim_order(ndim);
-  int64_t last_dim =
-      ndim - utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
+  // Explicitly convert ndim to signed to prevent underflow
+  int64_t last_dim = int64_t(ndim) - 1 - packed_dim;
 
   int64_t cur_dim = 0;
   for (int d = 0; d < ndim; ++d) {
@@ -149,7 +132,7 @@ std::vector<int64_t> unsqueeze_strides(
 
 std::vector<int64_t> calculate_padded_sizes(
     const std::vector<int64_t>& sizes,
-    const utils::GPUMemoryLayout memory_layout) {
+    const int32_t packed_dim) {
   int64_t ndim = sizes.size();
   if (ndim == 0) {
     ndim = 1;
@@ -163,8 +146,7 @@ std::vector<int64_t> calculate_padded_sizes(
   }
 
   // Pad the packed dim to the next multiple of 4.
-  const int64_t dim_offset =
-      utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
+  const int64_t dim_offset = packed_dim + 1;
   const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes);
   padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size);
 
@@ -174,7 +156,7 @@ std::vector<int64_t> calculate_padded_sizes(
 utils::uvec3 calculate_image_extents(
     const std::vector<int64_t>& padded_sizes,
     const std::vector<int64_t>& axis_map,
-    const utils::GPUMemoryLayout memory_layout) {
+    const int32_t packed_dim) {
   VK_CHECK_COND(padded_sizes.size() == 4);
   VK_CHECK_COND(axis_map.size() == 4);
 
@@ -195,21 +177,8 @@ utils::uvec3 calculate_image_extents(
   // Multiply the extents of the batch axis by the batch size.
   extents[batch_axis] *= padded_sizes.at(0);
 
-  switch (memory_layout) {
-    case utils::kWidthPacked:
-      VK_CHECK_COND(extents[axis_map.at(0)] % 4 == 0);
-      extents[axis_map.at(0)] /= 4;
-      break;
-    case utils::kHeightPacked:
-      VK_CHECK_COND(extents[axis_map.at(1)] % 4 == 0);
-      extents[axis_map.at(1)] /= 4;
-      break;
-    case utils::kChannelsPacked:
-      VK_CHECK_COND(extents[axis_map.at(2)] % 4 == 0);
-      extents[axis_map.at(2)] /= 4;
-      break;
-  }
-
+  VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0);
+  extents[axis_map.at(packed_dim)] /= 4;
   return extents;
 }
 
@@ -285,15 +254,15 @@ vkapi::VulkanBuffer allocate_buffer(
 vTensorStorage::vTensorStorage(
     Context* const context,
     const utils::StorageType storage_type,
-    const utils::GPUMemoryLayout gpu_memory_layout,
     const std::vector<int64_t>& axis_map,
+    const int32_t packed_dim,
     const std::vector<int64_t>& padded_sizes,
     const vkapi::ScalarType dtype,
     const bool allocate_memory)
     : context_(context),
       storage_type_{storage_type},
       image_extents_(
-          calculate_image_extents(padded_sizes, axis_map, gpu_memory_layout)),
+          calculate_image_extents(padded_sizes, axis_map, packed_dim)),
       buffer_length_{utils::multiply_integers(padded_sizes)},
       buffer_offset_{0},
       image_(allocate_image(
@@ -408,14 +377,14 @@ vTensor::vTensor(
     const utils::GPUMemoryLayout memory_layout,
     const bool allocate_memory)
     : dtype_(dtype),
-      memory_layout_(memory_layout),
       // Calculate tensor metadata
       sizes_(sizes.begin(), sizes.end()),
-      dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)),
+      packed_dim_(utils::to_packed_dim<int32_t>(memory_layout)),
+      dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)),
       axis_map_(default_axis_map()),
       strides_(calculate_strides(sizes, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
-      padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
+      padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_{{0, 0, 0}},
@@ -429,8 +398,8 @@ vTensor::vTensor(
       storage_(
           context,
           storage_type,
-          memory_layout_,
           axis_map_,
+          packed_dim_,
           padded_sizes_,
           dtype_,
           allocate_memory) {
@@ -451,9 +420,9 @@ vTensor::vTensor(
 
 vTensor::vTensor(const vTensor& other)
     : dtype_(other.dtype_),
-      memory_layout_(other.memory_layout_),
       // Copy tensor size metadata
       sizes_(other.sizes_.begin(), other.sizes_.end()),
+      packed_dim_{other.packed_dim_},
       dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
       axis_map_(other.axis_map_.begin(), other.axis_map_.end()),
       strides_(other.strides_.begin(), other.strides_.end()),
@@ -479,14 +448,14 @@ vTensor::vTensor(
     const std::vector<int64_t>& dim_order,
     const int64_t offset_numel)
     : dtype_(other.dtype_),
-      memory_layout_(estimate_memory_layout(dim_order)),
       // Copy tensor size metadata
       sizes_(sizes.begin(), sizes.end()),
+      packed_dim_(other.packed_dim_),
       dim_order_(dim_order.begin(), dim_order.end()),
       axis_map_(default_axis_map()),
       strides_(calculate_strides(sizes_, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
-      padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
+      padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_(other.logical_limits_),
@@ -542,6 +511,19 @@ void vTensor::set_logical_limits(const utils::uvec3& image_extents) {
   logical_limits_.limits[2] = image_extents[axis_map_.at(2)];
 }
 
+utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
+  switch (packed_dim_) {
+    case WHCN::kWidthDim:
+      return utils::kWidthPacked;
+    case WHCN::kHeightDim:
+      return utils::kHeightPacked;
+    case WHCN::kChannelsDim:
+      return utils::kChannelsPacked;
+    default:
+      VK_THROW("Invalid packed dim");
+  }
+}
+
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
   if (!sizes_uniform_.buffer()) {
     sizes_uniform_ =
@@ -618,21 +600,16 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) {
 
 void vTensor::update_metadata() {
   strides_ = calculate_strides(sizes_, dim_order_);
-  // Only update the memory layout for buffer-backed tensors. Strides are
-  // meaningless for texture-backed tensors and do not impact the memory layout.
-  if (storage_type() == utils::kBuffer) {
-    memory_layout_ = estimate_memory_layout(dim_order_);
-  }
   numel_ = utils::multiply_integers(sizes_);
 
-  padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_);
+  padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_);
   unsqueezed_strides_ = unsqueeze_strides(strides_, numel_);
   padded_numel_ = utils::multiply_integers(padded_sizes_);
 
   // Calculate the image extents that would have been used to allocate a texture
   // with the current sizes, and use that to set the logical limits.
   set_logical_limits(
-      calculate_image_extents(padded_sizes_, axis_map_, memory_layout_));
+      calculate_image_extents(padded_sizes_, axis_map_, packed_dim_));
 
   if (sizes_uniform_.buffer()) {
     sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
@@ -656,7 +633,7 @@ void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
   // For texture storage check that the current texture is large enough for
   // the new sizes of the tensor.
   utils::uvec3 virtual_extents =
-      calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);
+      calculate_image_extents(padded_sizes_, axis_map_, packed_dim_);
 
   bool valid_resize = virtual_extents[0] <= storage_.image_extents_[0];
   valid_resize =
@@ -725,23 +702,23 @@ void transpose_dim_order_inplace(
 
 void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) {
   std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1);
+
+  const int dim0_whcn = sizes_.size() - 1 - dim0;
+  const int dim1_whcn = sizes_.size() - 1 - dim1;
+  if (packed_dim_ == dim0_whcn) {
+    packed_dim_ = dim1_whcn;
+  }
+  if (packed_dim_ == dim1_whcn) {
+    packed_dim_ = dim0_whcn;
+  }
+
   if (storage_type() == utils::kBuffer) {
     transpose_dim_order_inplace(dim_order_, dim0, dim1);
   } else {
-    const int dim0_whcn = sizes_.size() - 1 - dim0;
-    const int dim1_whcn = sizes_.size() - 1 - dim1;
     // Cannot transpose batch dimension for texture storage
     VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3);
-
     std::iter_swap(
         axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn);
-
-    if (packed_dim_whcn_idx() == dim0_whcn) {
-      memory_layout_ = utils::GPUMemoryLayout(dim1_whcn);
-    }
-    if (packed_dim_whcn_idx() == dim1_whcn) {
-      memory_layout_ = utils::GPUMemoryLayout(dim0_whcn);
-    }
   }
   update_metadata();
 }
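
For readers skimming this diff, a minimal standalone sketch of the convention the new code relies on (illustrative only, not part of the change; the namespace and helper names below are hypothetical): a packed dim is a WHCN index (0 = width, 1 = height, 2 = channels), so `packed_dim + 1` is that dim's offset from the end of an NCHW-ordered size list, which is why calculate_padded_sizes pads `sizes[ndim - 1 - packed_dim]` up to a multiple of 4.

// Illustrative sketch only (not from the diff): demonstrates the WHCN
// packed-dim convention and the `packed_dim + 1` offset used above.
#include <cassert>
#include <cstdint>
#include <vector>

namespace whcn_demo {

constexpr int32_t kWidthDim = 0; // innermost / fastest-moving dim
constexpr int32_t kHeightDim = 1;
constexpr int32_t kChannelsDim = 2;

int64_t align_up_4(int64_t n) {
  return (n + 3) & ~int64_t(3);
}

// Pad the packed dim of an NCHW-ordered size list up to a multiple of 4.
// Mirrors only the `dim_offset = packed_dim + 1` indexing from
// calculate_padded_sizes; the unsqueeze-to-4-dims step is omitted.
std::vector<int64_t> pad_packed_dim(
    std::vector<int64_t> sizes,
    const int32_t packed_dim) {
  const size_t dim_offset = static_cast<size_t>(packed_dim) + 1;
  const size_t idx = sizes.size() - dim_offset;
  sizes.at(idx) = align_up_4(sizes.at(idx));
  return sizes;
}

} // namespace whcn_demo

int main() {
  using namespace whcn_demo;
  const std::vector<int64_t> sizes = {2, 3, 5, 7}; // {N, C, H, W}
  // Channels-packed: C is padded 3 -> 4.
  assert((pad_packed_dim(sizes, kChannelsDim) == std::vector<int64_t>{2, 4, 5, 7}));
  // Width-packed: W is padded 7 -> 8.
  assert((pad_packed_dim(sizes, kWidthDim) == std::vector<int64_t>{2, 3, 5, 8}));
  return 0;
}

Tracking the packed dim as a plain WHCN index rather than a GPUMemoryLayout enum is what lets virtual_transpose simply swap the index, with estimate_memory_layout recovering the enum only when a caller asks for it.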