diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index a85229b2b86..43ebbfecbc6 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -143,6 +143,43 @@ bool dim_order_is_valid(const std::vector<int64_t>& dim_order) {
   return sum == n * (n + 1) / 2;
 }
 
+/*
+ * Applies the following transformations to a tensor's dim_order vector:
+ * 1. Reverse the order of elements so that the fastest moving dimensions are
+ *    first.
+ * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the
+ *    width dimension, 1 represents the height dimension, and 2 represents the
+ *    channels dimension.
+ * 3. Unsqueeze the dim_order vector to the next multiple of 4.
+ *
+ * These transformations make it easier to use the dim order in a compute shader.
+ */
+std::vector<int64_t> create_whcn_dim_order(
+    const std::vector<int64_t>& dim_order) {
+  size_t ndim = dim_order.size();
+  std::vector<int64_t> whcn_order(ndim);
+
+  // Convert from NCHW to WHCN index, and flip the dim order so that the
+  // fastest moving dimension is first.
+  // example: {     1,     2,        0} -> {       2,     0,      1}
+  //          {height, width, channels} -> {channels, width, height}
+  for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim;
+       ++whcn_i, --nchw_i) {
+    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
+  }
+
+  // Unsqueeze to the next multiple of 4
+  size_t ndim_up4 = utils::align_up_4(ndim);
+  whcn_order.resize(ndim_up4);
+
+  // Append unsqueezed dimensions
+  for (size_t i = ndim; i < ndim_up4; ++i) {
+    whcn_order.at(i) = i;
+  }
+
+  return whcn_order;
+}
+
 std::vector<int64_t> unsqueeze_strides(
     const std::vector<int64_t>& strides,
     const int64_t numel) {
@@ -212,6 +249,97 @@ utils::uvec3 calculate_image_extents(
   return extents;
 }
 
+/*
+ * The physical image extents describe the size of an allocated texture
+ * resource, i.e. how many texels are in the width, height, and depth axes of
+ * the image. However, the axis map allows a tensor's logical dimensions to map
+ * to different physical texture axes; in essence, it describes a permutation
+ * between the logical width, height, channels, etc. dimensions of a tensor and
+ * the width, height, depth axes of a texture.
+ *
+ * The "logical extents" are simply the physical image extents permuted by the
+ * axis mapping. The logical extents are useful for constructing global work
+ * group sizes, so that it is easier to convert the global thread ID to a
+ * tensor index.
+ */
+utils::uvec3 calculate_logical_limits(
+    const utils::uvec3& image_extents,
+    const std::vector<int64_t>& axis_map) {
+  return {
+      image_extents[axis_map.at(0)],
+      image_extents[axis_map.at(1)],
+      image_extents[axis_map.at(2)],
+  };
+}
+
+/*
+ * Convenience overload of the above function to calculate logical limits
+ * directly from tensor sizes.
+ */
+utils::uvec3 calculate_logical_limits(
+    const std::vector<int64_t>& sizes,
+    const std::vector<int64_t>& axis_map,
+    const int32_t packed_dim) {
+  return calculate_logical_limits(
+      calculate_image_extents(
+          calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim),
+      axis_map);
+}
+
+int64_t calculate_gpu_buffer_numel(
+    Context* const context,
+    const std::vector<int64_t>& sizes,
+    const utils::uvec3 image_extents,
+    const utils::StorageType storage_type,
+    const vkapi::ScalarType dtype) {
+  // For texture backed tensors, simply multiply the total number of texels by 4
+  if (storage_type != utils::kBuffer) {
+    return image_extents[0] * image_extents[1] * image_extents[2] * 4;
+  }
+  const bool is_int8 = dtype == vkapi::kChar;
+  const bool int8_supported =
+      context->adapter_ptr()->has_full_int8_buffers_support();
+  const size_t numel = utils::multiply_integers(sizes);
+  // For int8 tensors, if the device does not support int8 buffers, then int32
+  // is used instead to represent the buffer data. Therefore the number of
+  // elements in the buffer is aligned to the next multiple of 4.
+  if (is_int8 && !int8_supported) {
+    return utils::align_up_4(numel);
+  }
+  return numel;
+}
+
+int32_t pack_into_int32(const std::vector<int64_t>& vec, const int32_t extra) {
+  int32_t packed = static_cast<int32_t>(
+      vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) +
+      (extra << 16));
+  return packed;
+}
+
+int32_t create_hashed_layout(
+    const std::vector<int64_t>& dim_order,
+    const std::vector<int64_t>& axis_map,
+    const int32_t packed_dim,
+    const utils::StorageType storage_type) {
+  if (storage_type == utils::kBuffer) {
+    return pack_into_int32(create_whcn_dim_order(dim_order), 0);
+  }
+  return pack_into_int32(axis_map, packed_dim);
+}
+
+size_t calculate_max_ubo_nbytes(
+    const size_t nbytes_per_ubo,
+    const utils::StorageType storage_type) {
+  // For texture backed tensors, the metadata fields needed are:
+  // sizes, logical limits
+  size_t max_metadata_field_count = 2u;
+  if (storage_type == utils::kBuffer) {
+    // sizes, strides, dim order, numel
+    max_metadata_field_count = 4u;
+  }
+  return max_metadata_field_count * nbytes_per_ubo;
+}
+
 //
 // vTensorStorage
 //
@@ -322,14 +450,21 @@ vTensorStorage::vTensorStorage(
     const utils::StorageType storage_type,
     const std::vector<int64_t>& axis_map,
     const int32_t packed_dim,
-    const std::vector<int64_t>& padded_sizes,
+    const std::vector<int64_t>& sizes,
     const vkapi::ScalarType dtype,
     const bool allocate_memory)
     : context_(context),
      storage_type_{storage_type},
-      image_extents_(
-          calculate_image_extents(padded_sizes, axis_map, packed_dim)),
-      buffer_length_{utils::multiply_integers(padded_sizes)},
+      image_extents_(calculate_image_extents(
+          calculate_padded_sizes(sizes, packed_dim),
+          axis_map,
+          packed_dim)),
+      buffer_length_{calculate_gpu_buffer_numel(
+          context_,
+          sizes,
+          image_extents_,
+          storage_type,
+          dtype)},
       buffer_offset_{0},
       image_(allocate_image(
          context_,
@@ -446,35 +581,45 @@ vTensor::vTensor(
       dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)),
       axis_map_(calculate_axis_map(sizes_, axis_map_layout)),
       strides_(calculate_strides(sizes, dim_order_)),
-      padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
-      unsqueezed_strides_{
-          unsqueeze_strides(strides_, utils::multiply_integers(sizes_))},
-      padded_numel_(utils::multiply_integers(padded_sizes_)),
+      numel_(utils::multiply_integers(sizes_)),
+      hashed_layout_(create_hashed_layout(
+          dim_order_,
+          axis_map_,
+          packed_dim_,
+          storage_type)),
+      // Related to tensor metadata UBOs
+
nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{calculate_max_ubo_nbytes(nbytes_per_ubo_, storage_type)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared( context, storage_type, axis_map_, packed_dim_, - padded_sizes_, + sizes, dtype_, allocate_memory)) { + // Derived metadata + std::vector whcn_dim_order(4, 0); + std::vector unsqueezed_strides(4, 0); + // Only calculate derived metadata if needed for the desired storage type. + // Note that logical limits may be used by buffer storage as well in order to + // set global work group sizes for some compute shaders. + if (storage_type == utils::kBuffer) { + whcn_dim_order = create_whcn_dim_order(dim_order_); + unsqueezed_strides = unsqueeze_strides(strides_, numel_); + } + uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); + whcn_dim_order, + unsqueezed_strides, + TextureLimits( + calculate_logical_limits(storage_->image_extents_, axis_map_)), + numel_}); VK_CHECK_COND( dim_order_is_valid(dim_order_), "computed dim order is invalid"); - - set_logical_limits(storage_->image_extents_); } // NOLINTNEXTLINE @@ -490,24 +635,23 @@ vTensor::vTensor( dim_order_(), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(), - padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_)), - unsqueezed_strides_(), - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(utils::multiply_integers(sizes_)), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + utils::kTexture3D)), + // Related to tensor metadata UBOs + nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{ + calculate_max_ubo_nbytes(nbytes_per_ubo_, utils::kTexture3D)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared(context, image)) { - uniform_data_ = std::make_shared(UniformData{ - sizes_, - {0, 0, 0, 0}, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); - set_logical_limits(storage_->image_extents_); + TextureLimits logical_limits( + calculate_logical_limits(storage_->image_extents_, axis_map_)); + uniform_data_ = std::make_shared( + UniformData{sizes_, {0, 0, 0, 0}, {0, 0, 0, 0}, logical_limits, numel_}); } vTensor::vTensor(vTensor& other) @@ -518,18 +662,11 @@ vTensor::vTensor(vTensor& other) dim_order_(other.dim_order_.begin(), other.dim_order_.end()), axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), - padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, - unsqueezed_strides_{ - other.unsqueezed_strides_.begin(), - other.unsqueezed_strides_.end()}, - padded_numel_(other.padded_numel_), + numel_(other.numel_), + hashed_layout_(other.hashed_layout_), + nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform 
Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(*other.get_uniform_data()); @@ -546,22 +683,21 @@ vTensor::vTensor( dim_order_(dim_order.begin(), dim_order.end()), axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), strides_(calculate_strides(sizes_, dim_order_)), - padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, - unsqueezed_strides_{ - unsqueeze_strides(strides_, utils::multiply_integers(sizes_))}, - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(other.numel_), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + other.storage_type())), + nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, + create_whcn_dim_order(dim_order_), + unsqueeze_strides(strides_, numel_), {other.logical_limits()}, static_cast(utils::multiply_integers(sizes_))}); @@ -584,6 +720,7 @@ uint32_t vTensor::UniformData::write_attribute( } switch (attr) { WRITE_ATTRIBUTE_CASE(SIZES, sizes_v); + WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, whcn_dim_order_v); WRITE_ATTRIBUTE_CASE(STRIDES, strides_v); WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits); WRITE_ATTRIBUTE_CASE(NUMEL, numel); @@ -624,12 +761,6 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_->buffer_; } -void vTensor::set_logical_limits(const utils::uvec3& image_extents) { - uniform_data_->logical_limits.limits[0] = image_extents[axis_map_.at(0)]; - uniform_data_->logical_limits.limits[1] = image_extents[axis_map_.at(1)]; - uniform_data_->logical_limits.limits[2] = image_extents[axis_map_.at(2)]; -} - utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { switch (packed_dim_) { case WHCN::kWidthDim: @@ -643,95 +774,108 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { } } +bool vTensor::is_contiguous() const { + if (storage_type() != utils::kBuffer) { + return false; + } + for (size_t i = 0; i < dim_order_.size(); ++i) { + if (dim_order_.at(i) != i) { + return false; + } + } + return true; +} + +size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const { + // For texture backed tensors, the metadata fields needed are: + // sizes, logical limits + size_t max_metadata_field_count = 2u; + if (storage_type() == utils::kBuffer) { + // sizes, strides, dim order, numel + max_metadata_field_count = 4u; + } + return max_metadata_field_count * nbytes_per_ubo; +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (sizes_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + 
nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); sizes_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), sizes_uniform_offset_, size_per_ubo); + uniforms_.buffer(), sizes_uniform_offset_, nbytes_per_ubo_); } -const vkapi::BufferBindInfo vTensor::strides_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; +const vkapi::BufferBindInfo vTensor::dim_order_ubo() { if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } - if (unsqueezed_strides_offset_ == kUniformOffsetUnset) { + if (dim_order_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); - unsqueezed_strides_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + dim_order_uniform_offset_ = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update( - utils::make_whcn_ivec4(unsqueezed_strides_), - unsqueezed_strides_offset_); + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + return vkapi::BufferBindInfo( + uniforms_.buffer(), dim_order_uniform_offset_, nbytes_per_ubo_); +} + +const vkapi::BufferBindInfo vTensor::strides_ubo() { + if (!uniforms_.buffer()) { + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); + } + if (strides_uniform_offset == kUniformOffsetUnset) { + VK_CHECK_COND( + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, + "Uniform data allocation has exceeded Tensor uniform buffer size"); + strides_uniform_offset = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } return vkapi::BufferBindInfo( - uniforms_.buffer(), unsqueezed_strides_offset_, size_per_ubo); + uniforms_.buffer(), strides_uniform_offset, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (logical_limits_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); logical_limits_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(logical_limits(), logical_limits_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), logical_limits_uniform_offset_, size_per_ubo); + uniforms_.buffer(), logical_limits_uniform_offset_, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::numel_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = 
ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (numel_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); numel_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(numel(), numel_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), numel_uniform_offset_, size_per_ubo); -} - -size_t vTensor::staging_buffer_numel() const { - const bool is_int8 = dtype_ == vkapi::kChar; - const bool int8_supported = - storage_->context_->adapter_ptr()->has_full_int8_buffers_support(); - if (is_int8 && !int8_supported) { - return utils::align_up_4(numel()); - } - if (storage_type() == utils::kBuffer) { - return numel(); - } - return padded_numel_; + uniforms_.buffer(), numel_uniform_offset_, nbytes_per_ubo_); } VkMemoryRequirements vTensor::get_memory_requirements() const { @@ -758,33 +902,36 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } void vTensor::update_metadata() { + numel_ = utils::multiply_integers(sizes_); strides_ = calculate_strides(sizes_, dim_order_); - uniform_data_->numel = utils::multiply_integers(sizes_); - - padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_); - unsqueezed_strides_ = unsqueeze_strides(strides_, numel()); - padded_numel_ = utils::multiply_integers(padded_sizes_); // Update uniform data if it has been modified + uniform_data_->numel = numel_; uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_); - uniform_data_->strides_v = utils::make_whcn_ivec4(unsqueezed_strides_); - - // Calculate the image extents that would have been used to allocate a texture - // withthe current sizes, and use that to set the logical limits. - set_logical_limits( - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_)); + uniform_data_->whcn_dim_order_v = + utils::make_ivec4(create_whcn_dim_order(dim_order_)); + uniform_data_->strides_v = + utils::make_whcn_ivec4(unsqueeze_strides(strides_, numel_)); + uniform_data_->numel = utils::safe_downcast(numel_); + uniform_data_->logical_limits.limits = + calculate_logical_limits(sizes_, axis_map_, packed_dim_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); } - if (unsqueezed_strides_offset_ != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->strides_v, unsqueezed_strides_offset_); + if (dim_order_uniform_offset_ != kUniformOffsetUnset) { + uniforms_.update( + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + if (strides_uniform_offset != kUniformOffsetUnset) { + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } if (numel_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(numel(), numel_uniform_offset_); + uniforms_.update(numel_, numel_uniform_offset_); } if (logical_limits_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(logical_limits(), logical_limits_uniform_offset_); + uniforms_.update( + uniform_data_->logical_limits.limits, logical_limits_uniform_offset_); } } @@ -792,8 +939,8 @@ void vTensor::check_sizes(const std::vector& sizes) const { if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
- utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_); + utils::uvec3 virtual_extents = calculate_image_extents( + calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -828,6 +975,11 @@ void vTensor::virtual_reconfigure( check_sizes(new_sizes); sizes_ = new_sizes; dim_order_ = new_dim_order; + + // Update the hashed layout because dim order is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } @@ -837,6 +989,7 @@ void vTensor::virtual_clone(const vTensor& other) { dim_order_ = other.dim_order_; axis_map_ = other.axis_map_; packed_dim_ = other.packed_dim_; + hashed_layout_ = other.hashed_layout_; *uniform_data_ = *other.get_uniform_data(); } @@ -895,6 +1048,11 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { axis_map_.at(3) = dim0_whcn; } } + + // Update the hashed layout because dim order / axis mpa is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 850dc2d7fab..78a24d87e77 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -81,6 +81,18 @@ struct LastAccess { : stage{stage_flags}, access{access_flags} {} }; +/* + * Calculate the number of elements that a GPU buffer would require to store the + * contents of a tensor. This will depend on the storage type and dtype of the + * tensor, as well as the features available on the device. + */ +int64_t calculate_gpu_buffer_numel( + Context* const context, + const std::vector& sizes, + const utils::uvec3 image_extents, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype); + class vTensorStorage final { public: // Do not allow empty vTensorStorage construction @@ -91,7 +103,7 @@ class vTensorStorage final { const utils::StorageType storage_type, const std::vector& axis_map, const int32_t packed_dim, - const std::vector& padded_sizes, + const std::vector& sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -140,6 +152,10 @@ class vTensorStorage final { void verify() const; public: + inline size_t buffer_len() const { + return utils::safe_downcast(buffer_length_); + } + inline VkFormat texture_format() { return image_.format(); } @@ -207,8 +223,11 @@ class vTensor final { vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; + ~vTensor() = default; + enum class Attribute : uint8_t { SIZES, + WHCN_DIM_ORDER, STRIDES, LOGICAL_LIMITS, NUMEL, @@ -216,6 +235,7 @@ class vTensor final { class UniformData { utils::ivec4 sizes_v; + utils::ivec4 whcn_dim_order_v; utils::ivec4 strides_v; // See the comments documenting logical_limits() for more context. 
TextureLimits logical_limits; @@ -227,10 +247,12 @@ class vTensor final { UniformData( const std::vector& sizes, + const std::vector& whcn_dim_order, const std::vector& strides, const TextureLimits& logical_limits, const size_t numel_ll) : sizes_v(utils::make_whcn_ivec4(sizes)), + whcn_dim_order_v(utils::make_ivec4(whcn_dim_order)), strides_v(utils::make_whcn_ivec4(strides)), logical_limits(logical_limits), numel(utils::safe_downcast(numel_ll)) {} @@ -293,21 +315,17 @@ class vTensor final { // strides of the tensor in NCHW dimension order std::vector strides_; - /* - * The below metadata members are derived from the above, and are typically - * to i.e. pass tensor metadata to compute shaders. - */ + // number of elements based on the canonical sizes + size_t numel_; + + // For texture backed tensors, this int32 contains the axis map data packed + // into a single int32. For buffer backed tensors, this int32 contains the + // wchn dim order data packed into a single int32. + int32_t hashed_layout_; - // padded sizes of the tensor in NCHW dimension order. See the - // calculate_padded_sizes() function for more context. Note that padded sizes - // are only used for texture storage, and not for buffer storage. - std::vector padded_sizes_; - // Contains the strides of the tensor, with the dimensionality padded to the - // nearest multiple of 4. Unsqueezed dims will have a stride of int32_t max. - std::vector unsqueezed_strides_; - // Contains the number of elements in the tensor according to the padded - // sizes. - size_t padded_numel_; + // Pre-compute these quantities to avoid frequent re-computation + size_t nbytes_per_ubo_; + size_t max_ubo_nbytes_; /* * Utility GPU buffer that can be passed to shaders in order to convey tensor @@ -320,15 +338,13 @@ class vTensor final { * context about the data contained in each buffer. */ ParamsBuffer uniforms_; - uint32_t uniforms_size_; - uint32_t sizes_uniform_offset_; - uint32_t unsqueezed_strides_offset_; - uint32_t numel_uniform_offset_; - uint32_t logical_limits_uniform_offset_; - // Maximum number of metadata fields that can be stored in the metadata UBO. - // This is used to calculate the size of the UBO that should be allocated. - constexpr static size_t kMaxMetadataFieldCount = 4; + uint32_t uniforms_size_ = 0u; + uint32_t sizes_uniform_offset_ = kUniformOffsetUnset; + uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset; + uint32_t strides_uniform_offset = kUniformOffsetUnset; + uint32_t numel_uniform_offset_ = kUniformOffsetUnset; + uint32_t logical_limits_uniform_offset_ = kUniformOffsetUnset; // Initial value of uniform buffer offsets. 1 is selected as it is essentially // impossible for a ubo to have an offset of 1. @@ -381,9 +397,6 @@ class vTensor final { return storage_->storage_type_ == utils::kBuffer; } - private: - void set_logical_limits(const utils::uvec3& image_extents); - public: /* * The logical limits of the tensor are derived from the image extents of the @@ -451,21 +464,37 @@ class vTensor final { return dim_order_; } + inline const std::vector& strides() const { + return strides_; + } + + inline size_t numel() const { + return numel_; + } + + inline size_t nbytes() const { + return element_size(dtype()) * numel(); + } + inline const std::vector& axis_map() const { return axis_map_; } /* - * Returns a single int32_t that contains the values of the axis map and the - * packed dimension packed into a single int32_t, such that it can be used as - * a specialization constant in a compute shader. 
This allows for the SPIR-V - * to bytecode compilation to perform compile-time unfolding on the axis map. - * Each element of the axis map and the value of the packed dimension take up - * 4 bits in the packed int32_t. + * For texture backed tensors, this function return a int32_t that contains + * the axis map + packed dimension. Each element of the axis map occupies 4 + * bits of the int32. + * + * For buffer backed tensors, the int32_t contains the WHCN dim order, where + * each element of the dim order array occupies 4 bits of the int32. + * + * This int32 is typically consumed as a specialization constant in compute + * shaders where it is subsequently unpacked. The layout data of a vTensor + * instance is typically static once created, which is why this method is + * appropriate. */ inline int32_t hashed_layout() const { - return axis_map_.at(0) + (axis_map_.at(1) << 4) + (axis_map_.at(2) << 8) + - (axis_map_.at(3) << 12) + (packed_dim_ << 16); + return hashed_layout_; } /* @@ -478,57 +507,48 @@ class vTensor final { return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2; } - inline const std::vector& strides() const { - return strides_; - } + /* + * Return true if a buffer backed tensor's dim order matches that of a + * contiguous tensor, i.e. the dim order will be {0, 1, 2, ... }. + * Returns false for texture backed tensors. + */ + bool is_contiguous() const; - inline const std::vector& unsqueezed_strides() const { - return unsqueezed_strides_; + private: + inline size_t nbytes_per_ubo() const { + return storage_->context_->adapter_ptr()->min_ubo_alignment(); } + size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const; + + public: /* - * Returns a GPU buffer containing the sizes of the tensor in WHCN order. - * Note that dimensions that are not present in the tensor's sizes are set to - * a size of 1. + * The functions below return the buffer binding info for a UBO that contains + * some metadata of the tensor, which can be used to pass in tensor metadata + * to a compute shader. The other method of passing in tensor metadata is via + * push constants. The trade-off between each is that push constants may be + * slightly more performant and memory efficient; however, to update the + * values in a push constant due to i.e. a tensor resize between inferences, + * the command buffer must be re-encoded. On the other hand, UBOs can update + * their data by writing to their mapped memory without requiring a command + * buffer re-encode. */ + const vkapi::BufferBindInfo sizes_ubo(); - /* - * Returns a GPU buffer containing the strides of the tensor in WHCN order. - * Note that the strides are extended to a dimensionality that is a multiple - * of 4, thus dimensions that are not present in the tensor's sizes are set to - * have a stride equal to the stride of the "slowest moving" dimension. - */ + const vkapi::BufferBindInfo dim_order_ubo(); + const vkapi::BufferBindInfo strides_ubo(); - /* - * Returns a GPU buffer containing the logical limits of the tensor. See the - * comments for logical_limits() for more context. - */ const vkapi::BufferBindInfo logical_limits_ubo(); - /* - * Returns the number of elements in the buffer used to store the tensor. 
- */ const vkapi::BufferBindInfo numel_ubo(); - inline size_t numel() const { - return uniform_data_->numel; - } - - inline size_t nbytes() const { - return element_size(dtype()) * numel(); - } - - /* - * Returns numel but based on padded_sizes_ instead of sizes_ - */ - inline size_t padded_numel() const { - return padded_numel_; + public: + inline size_t staging_buffer_numel() const { + return storage_->buffer_len(); } - size_t staging_buffer_numel() const; - inline size_t staging_buffer_nbytes() const { return element_size(dtype()) * staging_buffer_numel(); } @@ -608,6 +628,8 @@ class vTensor final { }; static constexpr vTensor::Attribute kTensorSizes = vTensor::Attribute::SIZES; +static constexpr vTensor::Attribute kTensorDimOrder = + vTensor::Attribute::WHCN_DIM_ORDER; static constexpr vTensor::Attribute kTensorStrides = vTensor::Attribute::STRIDES; static constexpr vTensor::Attribute kTensorLogicalLimits = diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 31514989dfc..21d80d5843f 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -346,6 +346,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().strides_ubo(); } + inline vkapi::BufferBindInfo dim_order_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().dim_order_ubo(); + } + inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) { return values_.at(idx).toTensor().numel_ubo(); } @@ -354,6 +358,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().has_standard_axis_map(); } + inline bool is_contiguous(const ValueRef idx) const { + return values_.at(idx).toTensor().is_contiguous(); + } + inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { return values_.at(idx).toTensor().logical_limits_ubo(); } @@ -363,6 +371,12 @@ class ComputeGraph final { values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorSizes); } + inline PushConstantDataInfo dim_order_pc_of(const ValueRef idx) const { + return PushConstantDataInfo( + values_.at(idx).toConstTensor().get_uniform_data(), + api::kTensorDimOrder); + } + inline PushConstantDataInfo strides_pc_of(const ValueRef idx) const { return PushConstantDataInfo( values_.at(idx).toConstTensor().get_uniform_data(), diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp index 938666802ef..9e8394ffa9c 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp @@ -32,8 +32,8 @@ BufferBindInfo::BufferBindInfo( BufferBindInfo::BufferBindInfo( const VulkanBuffer& buffer_p, - const uint32_t offset_p, - const uint32_t range_p) + const size_t offset_p, + const size_t range_p) : handle(buffer_p.handle()), offset(buffer_p.mem_offset() + offset_p), range(range_p) { diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h index 60d66a22619..15ea5e23e33 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.h +++ b/backends/vulkan/runtime/vk_api/Descriptor.h @@ -36,8 +36,8 @@ struct BufferBindInfo final { BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u); BufferBindInfo( const VulkanBuffer& buffer_p, - const uint32_t offset_p, - const uint32_t range_p); + const size_t offset_p, + const size_t range_p); }; struct ParamsBindList final { diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 
c4ccc860bc2..17f197dfdeb 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -259,14 +259,10 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { /*allocate_memory = */ false); ASSERT_TRUE(new_v_tensor.strides() == ref_strides); - ASSERT_TRUE( - new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides); // Resize vtensor and check that updated metadata is correct v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); - ASSERT_TRUE( - v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides); } } } @@ -1003,18 +999,14 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { b.virtual_resize(new_sizes); c.virtual_resize(new_sizes); - fill_staging( - staging_buffer_a, float(new_sizes[1] + 1.5f), a.staging_buffer_numel()); - fill_staging( - staging_buffer_b, - float(new_sizes[2] + 55.0f), - b.staging_buffer_numel()); + fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.numel()); + fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.numel()); submit_to_gpu(); check_staging_buffer( staging_buffer_c, float(new_sizes[1] + new_sizes[2] + 56.5f), - c.staging_buffer_numel()); + c.numel()); } } @@ -1096,7 +1088,6 @@ TEST_F(VulkanComputeAPITest, test_tensor_creation_from_vulkan_image) { const auto exp_numel = w * h * d * 4; EXPECT_TRUE(tensor.numel() == exp_numel); - EXPECT_TRUE(tensor.padded_numel() == exp_numel); } TEST(VulkanComputeGraphTest, test_values_scalars) {
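
Notes on the new tensor metadata helpers follow, each with a small standalone sketch. The sketches are not part of the patch: they use only the standard library, and every name in them is illustrative. First, the WHCN dim-order transform performed by create_whcn_dim_order(): the order is flipped so the fastest moving dimension comes first, each NCHW index i becomes the WHCN index ndim - 1 - i, and the result is padded to the next multiple of 4 with identity entries.

#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch: reverse an NCHW dim_order, convert each index to WHCN,
// and pad the result up to the next multiple of 4 with identity entries.
std::vector<int64_t> whcn_dim_order_sketch(
    const std::vector<int64_t>& dim_order) {
  const size_t ndim = dim_order.size();
  std::vector<int64_t> whcn_order(ndim);
  // Flip the traversal order and convert each NCHW index i to the WHCN index
  // (ndim - 1 - i), so that 0 means width, 1 means height, 2 means channels.
  for (size_t whcn_i = 0, nchw_i = ndim - 1; whcn_i < ndim;
       ++whcn_i, --nchw_i) {
    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
  }
  // Unsqueeze to the next multiple of 4; appended dims map to themselves.
  const size_t ndim_up4 = (ndim + 3) & ~size_t(3);
  whcn_order.resize(ndim_up4);
  for (size_t i = ndim; i < ndim_up4; ++i) {
    whcn_order.at(i) = static_cast<int64_t>(i);
  }
  return whcn_order;
}

int main() {
  // {height, width, channels} in NCHW dim-order terms...
  const std::vector<int64_t> nchw_dim_order = {1, 2, 0};
  // ...becomes {channels, width, height} in WHCN terms, padded to 4 entries.
  for (int64_t d : whcn_dim_order_sketch(nchw_dim_order)) {
    std::cout << d << " "; // prints: 2 0 1 3
  }
  std::cout << std::endl;
  return 0;
}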
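
hashed_layout_ packs either the axis map plus packed dim (texture storage) or the WHCN dim order (buffer storage) into the low nibbles of a single int32 so it can be consumed as a specialization constant. Below is a sketch of the packing and of the shift-and-mask unpacking a shader would perform, assuming the default axis map of {0, 1, 2, 2} with width (0) as the packed dim; each value is assumed to fit in 4 bits.

#include <cassert>
#include <cstdint>
#include <vector>

// Pack four small values plus one extra field into a single int32, mirroring
// the nibble layout used for hashed_layout_.
int32_t pack_nibbles(const std::vector<int64_t>& vec, const int32_t extra) {
  return static_cast<int32_t>(
      vec.at(0) | (vec.at(1) << 4) | (vec.at(2) << 8) | (vec.at(3) << 12) |
      (static_cast<int64_t>(extra) << 16));
}

// Unpack one 4-bit field, the way a compute shader would with shifts and masks.
int32_t extract_nibble(const int32_t packed, const int32_t pos) {
  return (packed >> (4 * pos)) & 0xF;
}

int main() {
  // Example: a texture-backed tensor with the default axis map {0, 1, 2, 2}
  // and width (0) as the packed dim.
  const std::vector<int64_t> axis_map = {0, 1, 2, 2};
  const int32_t packed_dim = 0;
  const int32_t hashed = pack_nibbles(axis_map, packed_dim);

  // Round trip: each nibble recovers the original axis map entry, and the
  // fifth nibble recovers the packed dim.
  for (int32_t i = 0; i < 4; ++i) {
    assert(extract_nibble(hashed, i) == axis_map.at(i));
  }
  assert(extract_nibble(hashed, 4) == packed_dim);
  return 0;
}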
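
calculate_gpu_buffer_numel() sizes the backing buffer as described in its comments: texture storage allocates 4 elements per texel, and buffer-backed int8 tensors on devices without full int8 buffer support round numel up to a multiple of 4 because the data is then represented with int32. A standalone sketch that replaces the storage type, dtype, and adapter query with plain booleans:

#include <array>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Standalone sketch of the buffer sizing rules: texture-backed tensors get 4
// elements per texel; buffer-backed int8 tensors on devices without full int8
// buffer support are rounded up to a multiple of 4 elements.
int64_t gpu_buffer_numel_sketch(
    const std::vector<int64_t>& sizes,
    const std::array<uint32_t, 3>& image_extents,
    const bool is_buffer_storage,
    const bool is_int8,
    const bool device_has_int8_buffers) {
  if (!is_buffer_storage) {
    return static_cast<int64_t>(image_extents[0]) * image_extents[1] *
        image_extents[2] * 4;
  }
  const int64_t numel = std::accumulate(
      sizes.begin(), sizes.end(), int64_t(1), std::multiplies<int64_t>());
  if (is_int8 && !device_has_int8_buffers) {
    // Data is stored as int32 on these devices, so align up to a multiple of 4.
    return (numel + 3) & ~int64_t(3);
  }
  return numel;
}

int main() {
  // Texture storage: a {4, 3, 2} image holds 4 * 3 * 2 texels, 4 elements each.
  const int64_t texel_numel =
      gpu_buffer_numel_sketch({1, 5, 7, 11}, {4, 3, 2}, false, false, true);
  // Buffer storage, int8 without device support: 1 * 3 * 7 = 21 -> 24.
  const int64_t int8_numel =
      gpu_buffer_numel_sketch({1, 3, 7}, {0, 0, 0}, true, true, false);
  return (texel_numel == 96 && int8_numel == 24) ? 0 : 1;
}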
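
sizes_ubo(), dim_order_ubo(), strides_ubo(), logical_limits_ubo(), and numel_ubo() all share one lazily created parameter buffer of max_ubo_nbytes_ and assign each metadata field an nbytes_per_ubo_-sized slot the first time it is requested; repeated requests reuse the same offset. A sketch of that sub-allocation pattern with the Vulkan details stripped away:

#include <cassert>
#include <cstdint>
#include <stdexcept>

// Standalone sketch of the lazy sub-allocation used by the *_ubo() accessors:
// a fixed-capacity arena hands out one aligned slot per metadata field on
// first request, and an offset keeps its assigned value on later requests.
class MetadataUboArena {
 public:
  // Sentinel for "offset not yet assigned"; 1 can never be a real offset
  // because slots are aligned to nbytes_per_field.
  static constexpr uint32_t kUnset = 1u;

  MetadataUboArena(uint32_t nbytes_per_field, uint32_t max_field_count)
      : nbytes_per_field_(nbytes_per_field),
        capacity_(nbytes_per_field * max_field_count) {}

  // Returns the byte offset for a field, assigning a new slot on first request.
  uint32_t offset_for(uint32_t& field_offset) {
    if (field_offset == kUnset) {
      if (used_ + nbytes_per_field_ > capacity_) {
        throw std::runtime_error("exceeded tensor uniform buffer size");
      }
      field_offset = used_;
      used_ += nbytes_per_field_;
    }
    return field_offset;
  }

 private:
  uint32_t nbytes_per_field_;
  uint32_t capacity_;
  uint32_t used_ = 0;
};

int main() {
  // e.g. a min UBO alignment of 256 bytes and texture storage (2 fields max).
  MetadataUboArena arena(256, 2);
  uint32_t sizes_offset = MetadataUboArena::kUnset;
  uint32_t limits_offset = MetadataUboArena::kUnset;
  assert(arena.offset_for(sizes_offset) == 0);    // first request allocates
  assert(arena.offset_for(limits_offset) == 256); // next field gets a new slot
  assert(arena.offset_for(sizes_offset) == 0);    // repeated request reuses it
  return 0;
}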
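
Finally, the logical limits are nothing more than the physical image extents permuted through the axis map, so limits[i] is the extent of the physical axis that logical axis i maps to. A one-function sketch, using a transposed axis map as the example:

#include <array>
#include <cstdint>
#include <vector>

// Permute physical image extents by the axis map to obtain the logical limits.
std::array<uint32_t, 3> logical_limits_sketch(
    const std::array<uint32_t, 3>& image_extents,
    const std::vector<int64_t>& axis_map) {
  return {
      image_extents[axis_map.at(0)],
      image_extents[axis_map.at(1)],
      image_extents[axis_map.at(2)],
  };
}

int main() {
  // A texture allocated as {W=8, H=4, D=2} viewed through axis map {1, 0, 2}
  // (width and height swapped) has logical limits {4, 8, 2}.
  const std::array<uint32_t, 3> limits =
      logical_limits_sketch({8, 4, 2}, {1, 0, 2});
  return (limits[0] == 4 && limits[1] == 8 && limits[2] == 2) ? 0 : 1;
}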