diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index 6f7167c54fb..e9437e3bd09 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -14,6 +14,10 @@ namespace vkcompute {
 namespace api {
 
+/*
+ * Used to infer the sizes of a tensor that would correspond to a given
+ * VulkanImage.
+ */
 std::vector<int64_t> calculate_sizes(
     const vkapi::VulkanImage& image,
     const utils::GPUMemoryLayout memory_layout) {
@@ -143,58 +147,19 @@ bool dim_order_is_valid(const std::vector<int64_t>& dim_order) {
   return sum == n * (n + 1) / 2;
 }
 
-/*
- * Applies the following transformations to a tensor's dim_order vector:
- * 1. Reverse the order of elements so that the fastest moving dimensions are
- *    first.
- * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the
- *    width dimension, 1 represents the height dimension, and 2 represents the
- *    channels dimension.
- * 3. Unsqueeze the dim_order vector to the next multiple of 4.
-
- * These transformations make it easier to use the dim order in a compute shader
- */
-std::vector<int64_t> create_whcn_dim_order(
-    const std::vector<int64_t>& dim_order) {
-  size_t ndim = dim_order.size();
-  std::vector<int64_t> whcn_order(ndim);
-
-  // Convert from NCHW to WHCN index, and flip the dim order so that the fastest
-  // moving dimension is first.
-  // example: { 1, 2, 0} -> { 2, 0, 1}
-  //          {height, width, channels} -> {channels, width, height}
-  for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim;
-       ++whcn_i, --nchw_i) {
-    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
-  }
-
-  // Unsqueeze to the next multiple of 4
-  size_t ndim_up4 = utils::align_up_4(ndim);
-  whcn_order.resize(ndim_up4);
-
-  // Append unsqueezed dimensions
-  for (size_t i = ndim; i < ndim_up4; ++i) {
-    whcn_order.at(i) = i;
-  }
-
-  return whcn_order;
-}
-
-std::vector<int64_t> unsqueeze_strides(
-    const std::vector<int64_t>& strides,
-    const int64_t numel) {
-  const size_t ndim = strides.size();
-  const size_t ndim_up4 = utils::align_up_4(strides.size());
-  std::vector<int64_t> unsqueezed_strides(ndim_up4);
-  for (int32_t i = 1; i <= ndim; ++i) {
-    int64_t dim_stride = strides.at(ndim - i);
-    unsqueezed_strides.at(ndim_up4 - i) = dim_stride;
-  }
-
-  for (int32_t i = ndim + 1; i <= ndim_up4; ++i) {
-    unsqueezed_strides.at(ndim_up4 - i) = numel;
-  }
-  return unsqueezed_strides;
+utils::ivec4 flip_and_unsqueeze_ivec4(
+    const std::vector<int64_t>& tensor_metadata,
+    const vTensor::Attribute metadata_type,
+    const size_t numel) {
+  VK_CHECK_COND(tensor_metadata.size() <= 4);
+  std::vector<int32_t> flipped_metadata =
+      flip_and_unsqueeze<int32_t>(tensor_metadata, metadata_type, numel);
+  return {
+      flipped_metadata.at(0),
+      flipped_metadata.at(1),
+      flipped_metadata.at(2),
+      flipped_metadata.at(3),
+  };
 }
 
 std::vector<int64_t> calculate_padded_sizes(
@@ -309,7 +274,8 @@ int64_t calculate_gpu_buffer_numel(
   return numel;
 }
 
-int32_t pack_into_int32(const std::vector<int64_t>& vec, const int32_t extra) {
+template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+int32_t pack_into_int32(const std::vector<T>& vec, const int32_t extra) {
   int32_t packed = static_cast<int32_t>(
       vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) +
       (extra << 16));
@@ -322,22 +288,24 @@ int32_t create_hashed_layout(
     const int32_t packed_dim,
     const utils::StorageType storage_type) {
   if (storage_type == utils::kBuffer) {
-    return pack_into_int32(create_whcn_dim_order(dim_order), 0);
+    return pack_into_int32(
+        flip_and_unsqueeze<int32_t>(dim_order, kTensorDimOrder, 0), 0);
   }
   return pack_into_int32(axis_map, packed_dim);
 }
 
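To make the new metadata encoding concrete, here is a minimal standalone sketch of what the flip-and-unsqueeze transformation produces for a contiguous 3-D tensor's dim order, and how the resulting WHCN order folds into the packed int32 used for the hashed layout. The helper name `flip_and_unsqueeze_demo` and the `main` driver are illustrative only and are not part of this patch; the real implementation is the `flip_and_unsqueeze` template added to Tensor.h further down in this diff.

```cpp
// Hypothetical standalone demo (names are illustrative, not part of the diff):
// shows the dim-order case of the flip-and-unsqueeze transformation and how
// pack_into_int32() would fold the resulting WHCN dim order into one int32.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int32_t> flip_and_unsqueeze_demo(
    const std::vector<int64_t>& dim_order) {
  const size_t ndim = dim_order.size();
  const size_t ndim_up4 = (ndim + 3) / 4 * 4;
  std::vector<int32_t> out(ndim_up4);
  // Reverse so the fastest moving dim comes first, and remap NCHW -> WHCN
  // indices (0 = width, 1 = height, 2 = channels, ...).
  for (size_t i = 0; i < ndim; ++i) {
    out[i] = static_cast<int32_t>(ndim - 1 - dim_order[ndim - 1 - i]);
  }
  // Unsqueezed (padding) dims keep their own position in the order.
  for (size_t i = ndim; i < ndim_up4; ++i) {
    out[i] = static_cast<int32_t>(i);
  }
  return out;
}

int main() {
  // Contiguous NCHW dim order for a 3-D tensor: {0, 1, 2} = {C, H, W}.
  std::vector<int32_t> whcn = flip_and_unsqueeze_demo({0, 1, 2});
  assert((whcn == std::vector<int32_t>{0, 1, 2, 3}));  // {W, H, C, pad}

  // Nibble-packing as in pack_into_int32(): 4 bits per entry plus an extra.
  int32_t packed =
      whcn[0] + (whcn[1] << 4) + (whcn[2] << 8) + (whcn[3] << 12) + (0 << 16);
  assert(packed == 0x3210);
  return 0;
}
```

Flipping the order this way means element 0 of the packed vector always names the fastest-moving dimension in WHCN terms, which is the convention the buffer-indexing shaders consume.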
 size_t calculate_max_ubo_nbytes(
-    const size_t nbytes_per_ubo,
+    const size_t min_nbytes_per_ubo,
     const utils::StorageType storage_type) {
-  // For texture backed tensors, the metadata fields needed are:
-  // sizes, logical limits
-  size_t max_metadata_field_count = 2u;
+  size_t ivec4_ubo_nbytes = utils::align_up(size_t(16), min_nbytes_per_ubo);
+  size_t uvec3_ubo_nbytes = utils::align_up(size_t(12), min_nbytes_per_ubo);
+  size_t int32_ubo_nbytes = utils::align_up(size_t(4), min_nbytes_per_ubo);
   if (storage_type == utils::kBuffer) {
     // sizes, strides, dim order, numel
-    max_metadata_field_count = 4u;
+    return 3 * ivec4_ubo_nbytes + int32_ubo_nbytes;
   }
-  return max_metadata_field_count * nbytes_per_ubo;
+  // sizes, logical limits
+  return ivec4_ubo_nbytes + uvec3_ubo_nbytes;
 }
 
 //
@@ -595,8 +563,9 @@ vTensor::vTensor(
           packed_dim_,
           storage_type)),
       // Related to tensor metadata UBOs
-      nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
-      max_ubo_nbytes_{calculate_max_ubo_nbytes(nbytes_per_ubo_, storage_type)},
+      min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
+      max_ubo_nbytes_{
+          calculate_max_ubo_nbytes(min_nbytes_per_ubo_, storage_type)},
       uniforms_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(
@@ -607,23 +576,13 @@ vTensor::vTensor(
           sizes,
           dtype_,
           allocate_memory)) {
-  // Derived metadata
-  std::vector<int64_t> whcn_dim_order(4, 0);
-  std::vector<int64_t> unsqueezed_strides(4, 0);
-  // Only calculate derived metadata if needed for the desired storage type.
-  // Note that logical limits may be used by buffer storage as well in order to
-  // set global work group sizes for some compute shaders.
-  if (storage_type == utils::kBuffer) {
-    whcn_dim_order = create_whcn_dim_order(dim_order_);
-    unsqueezed_strides = unsqueeze_strides(strides_, numel_);
-  }
-
   uniform_data_ = std::make_shared<UniformData>(UniformData{
+      numel_,
       sizes_,
-      whcn_dim_order,
-      unsqueezed_strides,
-      calculate_logical_limits(storage_->image_extents_, axis_map_),
-      numel_});
+      dim_order_,
+      strides_,
+      calculate_logical_limits(storage_->image_extents_, axis_map_)});
+
   VK_CHECK_COND(
       dim_order_is_valid(dim_order_), "computed dim order is invalid");
 }
@@ -648,18 +607,18 @@ vTensor::vTensor(
           packed_dim_,
           utils::kTexture3D)),
       // Related to tensor metadata UBOs
-      nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
+      min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
       max_ubo_nbytes_{
-          calculate_max_ubo_nbytes(nbytes_per_ubo_, utils::kTexture3D)},
+          calculate_max_ubo_nbytes(min_nbytes_per_ubo_, utils::kTexture3D)},
       uniforms_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(context, image)) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
+      numel_,
       sizes_,
       {0, 0, 0, 0},
       {0, 0, 0, 0},
-      calculate_logical_limits(storage_->image_extents_, axis_map_),
-      numel_});
+      calculate_logical_limits(storage_->image_extents_, axis_map_)});
 }
 
 vTensor::vTensor(vTensor& other)
@@ -672,7 +631,7 @@ vTensor::vTensor(vTensor& other)
       strides_(other.strides_.begin(), other.strides_.end()),
       numel_(other.numel_),
       hashed_layout_(other.hashed_layout_),
-      nbytes_per_ubo_{other.nbytes_per_ubo_},
+      min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
       // Copy Tensor storage
@@ -697,22 +656,35 @@ vTensor::vTensor(
           axis_map_,
           packed_dim_,
           other.storage_type())),
-      nbytes_per_ubo_{other.nbytes_per_ubo_},
+      min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
       // Copy Tensor storage
       storage_(other.storage_) {
   uniform_data_ =
       std::make_shared<UniformData>(UniformData{
+          static_cast<size_t>(utils::multiply_integers(sizes_)),
           sizes_,
-          create_whcn_dim_order(dim_order_),
-          unsqueeze_strides(strides_, numel_),
-          other.logical_limits(),
-          static_cast<size_t>(utils::multiply_integers(sizes_))});
+          dim_order_,
+          strides_,
+          other.logical_limits()});
 
   VK_CHECK_COND(
       dim_order_is_valid(dim_order_), "new dim order provided is invalid");
 }
 
+vTensor::UniformData::UniformData(
+    const size_t numel_ll,
+    const std::vector<int64_t>& sizes,
+    const std::vector<int64_t>& dim_order,
+    const std::vector<int64_t>& strides,
+    const utils::uvec3& limits)
+    : numel(utils::safe_downcast<int32_t>(numel_ll)),
+      sizes_v(flip_and_unsqueeze_ivec4(sizes, kTensorSizes, numel_ll)),
+      dim_order_v(
+          flip_and_unsqueeze_ivec4(dim_order, kTensorDimOrder, numel_ll)),
+      strides_v(flip_and_unsqueeze_ivec4(strides, kTensorStrides, numel_ll)),
+      logical_limits(limits) {}
+
 uint32_t vTensor::UniformData::write_attribute(
     void* dst,
     const uint32_t dst_offset,
@@ -727,11 +699,11 @@ uint32_t vTensor::UniformData::write_attribute(
     return sizeof(member_name);                                   \
   }
   switch (attr) {
+    WRITE_ATTRIBUTE_CASE(NUMEL, numel);
     WRITE_ATTRIBUTE_CASE(SIZES, sizes_v);
-    WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, whcn_dim_order_v);
+    WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, dim_order_v);
     WRITE_ATTRIBUTE_CASE(STRIDES, strides_v);
     WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits);
-    WRITE_ATTRIBUTE_CASE(NUMEL, numel);
     default:
       VK_THROW("Invalid Attribute");
   }
@@ -806,84 +778,25 @@ size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const {
 }
 
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (sizes_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    sizes_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), sizes_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(&sizes_uniform_offset_, uniform_data_->sizes_v);
 }
 
 const vkapi::BufferBindInfo vTensor::dim_order_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (dim_order_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    dim_order_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(
-        uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), dim_order_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(
+      &dim_order_uniform_offset_, uniform_data_->dim_order_v);
 }
 
 const vkapi::BufferBindInfo vTensor::strides_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (strides_uniform_offset == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    strides_uniform_offset = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(uniform_data_->strides_v, strides_uniform_offset);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), strides_uniform_offset, nbytes_per_ubo_);
+  return metadata_ubo_impl(&strides_uniform_offset, uniform_data_->strides_v);
 }
 
 const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    logical_limits_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), logical_limits_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(
+      &logical_limits_uniform_offset_, uniform_data_->logical_limits);
 }
 
 const vkapi::BufferBindInfo vTensor::numel_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (numel_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    numel_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(numel(), numel_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), numel_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(&numel_uniform_offset_, uniform_data_->numel);
 }
 
 VkMemoryRequirements vTensor::get_memory_requirements() const {
@@ -936,13 +849,13 @@ void vTensor::update_metadata() {
   strides_ = calculate_strides(sizes_, dim_order_);
 
   // Update uniform data if it has been modified
-  uniform_data_->numel = numel_;
-  uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_);
-  uniform_data_->whcn_dim_order_v =
-      utils::make_ivec4(create_whcn_dim_order(dim_order_));
-  uniform_data_->strides_v =
-      utils::make_whcn_ivec4(unsqueeze_strides(strides_, numel_));
   uniform_data_->numel = utils::safe_downcast<int32_t>(numel_);
+  uniform_data_->sizes_v =
+      flip_and_unsqueeze_ivec4(sizes_, kTensorSizes, numel_);
+  uniform_data_->dim_order_v =
+      flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_);
+  uniform_data_->strides_v =
+      flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_);
   uniform_data_->logical_limits.limits =
       calculate_logical_limits(sizes_, axis_map_, packed_dim_);
 
@@ -950,8 +863,7 @@ void vTensor::update_metadata() {
     uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
   }
   if (dim_order_uniform_offset_ != kUniformOffsetUnset) {
-    uniforms_.update(
-        uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_);
+    uniforms_.update(uniform_data_->dim_order_v, dim_order_uniform_offset_);
   }
   if (strides_uniform_offset != kUniformOffsetUnset) {
     uniforms_.update(uniform_data_->strides_v, strides_uniform_offset);
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
index bcca956e5ea..fefbd2aa71a 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.h
+++ b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -36,10 +36,6 @@ std::vector<int64_t> calculate_strides(
     const std::vector<int64_t>& sizes,
     const std::vector<int64_t>& dim_order);
 
-std::vector<int64_t> unsqueeze_strides(
-    const std::vector<int64_t>& strides,
-    const int64_t numel);
-
 /*
  * When stored on the GPU, tensor data is stored using texels (i.e. a vector of
  * 4 scalar values) in order to take advantage of the GPU's native vectorization
@@ -236,28 +232,23 @@ class vTensor final {
   };
 
   class UniformData {
+    // Contains the number of elements in the tensor according to the canonical
+    // sizes.
+    int32_t numel;
     utils::ivec4 sizes_v;
-    utils::ivec4 whcn_dim_order_v;
+    utils::ivec4 dim_order_v;
     utils::ivec4 strides_v;
     // See the comments documenting logical_limits() for more context.
     TextureLimits logical_limits;
-    // Contains the number of elements in the tensor according to the canonical
-    // sizes.
-    int32_t numel;
 
     friend class vTensor;
 
     UniformData(
+        const size_t numel_ll,
         const std::vector<int64_t>& sizes,
-        const std::vector<int64_t>& whcn_dim_order,
+        const std::vector<int64_t>& dim_order,
         const std::vector<int64_t>& strides,
-        const utils::uvec3& logical_limits,
-        const size_t numel_ll)
-        : sizes_v(utils::make_whcn_ivec4(sizes)),
-          whcn_dim_order_v(utils::make_ivec4(whcn_dim_order)),
-          strides_v(utils::make_whcn_ivec4(strides)),
-          logical_limits(logical_limits),
-          numel(utils::safe_downcast<int32_t>(numel_ll)) {}
+        const utils::uvec3& limits);
 
    public:
     /*
@@ -326,7 +317,7 @@ class vTensor final {
   int32_t hashed_layout_;
 
   // Pre-compute these quantities to avoid frequent re-computation
-  size_t nbytes_per_ubo_;
+  size_t min_nbytes_per_ubo_;
   size_t max_ubo_nbytes_;
 
   /*
@@ -523,6 +514,26 @@ class vTensor final {
 
   size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const;
 
+  template <typename T>
+  const vkapi::BufferBindInfo metadata_ubo_impl(
+      uint32_t* param_buffer_offset,
+      const T& data) {
+    if (!uniforms_.buffer()) {
+      uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
+    }
+    size_t ubo_nbytes = utils::align_up(sizeof(data), min_nbytes_per_ubo_);
+    if (*param_buffer_offset == kUniformOffsetUnset) {
+      VK_CHECK_COND(
+          (uniforms_size_ + ubo_nbytes) <= max_ubo_nbytes_,
+          "Uniform data allocation has exceeded Tensor uniform buffer size");
+      *param_buffer_offset = uniforms_size_;
+      uniforms_size_ += ubo_nbytes;
+      uniforms_.update(data, *param_buffer_offset);
+    }
+    return vkapi::BufferBindInfo(
+        uniforms_.buffer(), *param_buffer_offset, ubo_nbytes);
+  }
+
  public:
   /*
    * The functions below return the buffer binding info for a UBO that contains
@@ -649,5 +660,70 @@ static constexpr vTensor::Attribute kTensorLogicalLimits =
     vTensor::Attribute::LOGICAL_LIMITS;
 static constexpr vTensor::Attribute kTensorNumel = vTensor::Attribute::NUMEL;
 
+/*
+ * Prepare a tensor metadata vector for consumption on the GPU:
+ * 1. Convert NCHW dim order and indices to WHCN dim order and indices
+ * 2. Unsqueeze to the next multiple of 4 dims
+ * 3. Convert to requested output dtype
+ */
+template <
+    typename T,
+    typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+std::vector<T> flip_and_unsqueeze(
+    const std::vector<int64_t>& tensor_metadata,
+    const vTensor::Attribute metadata_type,
+    const size_t numel,
+    const int32_t fixed_ndim = -1) {
+  const size_t ndim = tensor_metadata.size();
+  size_t ndim_up4 =
+      std::max(utils::align_up_4(tensor_metadata.size()), size_t(4));
+
+  if (fixed_ndim > 0) {
+    VK_CHECK_COND(fixed_ndim >= ndim);
+    ndim_up4 = static_cast<size_t>(fixed_ndim);
+  }
+
+  std::vector<T> flipped_metadata(ndim_up4);
+
+  for (int flipped_i = 0; flipped_i < ndim; ++flipped_i) {
+    T val_at_dim =
+        utils::safe_downcast<T>(tensor_metadata.at(ndim - 1 - flipped_i));
+    if (metadata_type == kTensorDimOrder) {
+      val_at_dim = utils::safe_downcast<T>(ndim - 1 - val_at_dim);
+    }
+    flipped_metadata.at(flipped_i) = val_at_dim;
+  }
+
+  switch (metadata_type) {
+    case kTensorStrides:
+      for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) {
+        flipped_metadata.at(unsqueezed_i) = utils::safe_downcast<T>(numel);
+      }
+      break;
+    case kTensorDimOrder:
+      for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) {
+        flipped_metadata.at(unsqueezed_i) =
+            utils::safe_downcast<T>(unsqueezed_i);
+      }
+      break;
+    // Default: unsqueeze with ones
+    default:
+      for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) {
+        flipped_metadata.at(unsqueezed_i) = utils::safe_downcast<T>(1);
+      }
+      break;
+  }
+
+  return flipped_metadata;
+}
+
+/*
+ * Same as flip and unsqueeze, but returns the metadata as an `ivec4`.
+ */
+utils::ivec4 flip_and_unsqueeze_ivec4(
+    const std::vector<int64_t>& tensor_metadata,
+    const vTensor::Attribute metadata_type,
+    const size_t numel);
+
 } // namespace api
 } // namespace vkcompute
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 9a857f41fde..a193d02da88 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -114,7 +114,7 @@ TEST_F(VulkanComputeAPITest, print_shader_executable_properties) {
 std::vector<int64_t> get_reference_strides(
     const std::vector<int64_t>& sizes,
     const utils::GPUMemoryLayout layout,
-    const bool unsqueezed = false) {
+    const bool flip_unsqueezed = false) {
   int64_t C = utils::val_at(-3, sizes);
   int64_t H = utils::val_at(-2, sizes);
   int64_t W = utils::val_at(-1, sizes);
@@ -125,18 +125,20 @@ std::vector<int64_t> get_reference_strides(
     case utils::kWidthPacked:
       switch (sizes.size()) {
         case 1:
-          if (unsqueezed)
-            return {numel, numel, numel, 1};
+          if (flip_unsqueezed)
+            return {1, numel, numel, numel};
           return {1};
         case 2:
-          if (unsqueezed)
-            return {numel, numel, W, 1};
+          if (flip_unsqueezed)
+            return {1, W, numel, numel};
           return {W, 1};
         case 3:
-          if (unsqueezed)
-            return {numel, H * W, W, 1};
+          if (flip_unsqueezed)
+            return {1, W, H * W, numel};
           return {H * W, W, 1};
         case 4:
+          if (flip_unsqueezed)
+            return {1, W, H * W, C * H * W};
           return {C * H * W, H * W, W, 1};
         default:
           return {};
@@ -145,18 +147,21 @@ std::vector<int64_t> get_reference_strides(
     case utils::kHeightPacked:
       switch (sizes.size()) {
         case 1:
-          if (unsqueezed)
-            return {numel, numel, numel, 1};
+          if (flip_unsqueezed)
+            return {1, numel, numel, numel};
           return {1};
         case 2:
-          if (unsqueezed)
-            return {numel, numel, 1, H};
+          if (flip_unsqueezed)
+            return {H, 1, numel, numel};
+          return {1, H};
           return {1, H};
         case 3:
-          if (unsqueezed)
-            return {numel, H * W, 1, H};
+          if (flip_unsqueezed)
+            return {H, 1, H * W, numel};
           return {W * H, 1, H};
         case 4:
+          if (flip_unsqueezed)
+            return {H, 1, W * H, C * W * H};
           return {C * W * H, W * H, 1, H};
         default:
           return {};
@@ -164,18 +169,20 @@ std::vector<int64_t> get_reference_strides(
     case utils::kChannelsPacked:
       switch (sizes.size()) {
         case 1:
-          if (unsqueezed)
-            return {numel, numel, numel, 1};
+          if (flip_unsqueezed)
+            return {1, numel, numel, numel};
           return {1};
         case 2:
-          if (unsqueezed)
-            return {numel, numel, W, 1};
+          if (flip_unsqueezed)
+            return {1, W, numel, numel};
           return {W, 1};
         case 3:
-          if (unsqueezed)
-            return {numel, 1, W * C, C};
+          if (flip_unsqueezed)
+            return {C, W * C, 1, numel};
           return {1, W * C, C};
         case 4:
+          if (flip_unsqueezed)
+            return {C, W * C, 1, H * W * C};
           return {H * W * C, 1, W * C, C};
         default:
           return {};
@@ -184,6 +191,41 @@ std::vector<int64_t> get_reference_strides(
   return {};
 }
 
+/*
+ * Applies the following transformations to a tensor's dim_order vector:
+ * 1. Reverse the order of elements so that the fastest moving dimensions are
+ *    first.
+ * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the
+ *    width dimension, 1 represents the height dimension, and 2 represents the
+ *    channels dimension.
+ * 3. Unsqueeze the dim_order vector to the next multiple of 4.
+ */
+std::vector<int64_t> create_whcn_dim_order(
+    const std::vector<int64_t>& dim_order) {
+  size_t ndim = dim_order.size();
+  std::vector<int64_t> whcn_order(ndim);
+
+  // Convert from NCHW to WHCN index, and flip the dim order so that the fastest
+  // moving dimension is first.
+  // example: { 1, 2, 0} -> { 2, 0, 1}
+  //          {height, width, channels} -> {channels, width, height}
+  for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim;
+       ++whcn_i, --nchw_i) {
+    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
+  }
+
+  // Unsqueeze to the next multiple of 4
+  size_t ndim_up4 = utils::align_up_4(ndim);
+  whcn_order.resize(ndim_up4);
+
+  // Append unsqueezed dimensions
+  for (size_t i = ndim; i < ndim_up4; ++i) {
+    whcn_order.at(i) = i;
+  }
+
+  return whcn_order;
+}
+
 TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) {
   vkapi::ShaderInfo empty_shader_info;
   EXPECT_FALSE(empty_shader_info);
@@ -191,6 +233,20 @@ TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) {
   EXPECT_TRUE(empty_shader_info.src_code.size == 0u);
 }
 
+bool compare_vectors(
+    const std::vector<int32_t>& v32,
+    const std::vector<int64_t>& v64) {
+  if (v32.size() != v64.size()) {
+    return false;
+  }
+  for (size_t i = 0; i < v32.size(); ++i) {
+    if (static_cast<int64_t>(v32[i]) != v64[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
 TEST_F(VulkanComputeAPITest, calculate_dim_order_test) {
   // ndim, GPUMemoryLayout, expected dim order pairs
   std::vector>> test_cases = {
@@ -238,17 +294,27 @@
       std::vector<int64_t> dim_order =
           calculate_dim_order(sizes.size(), packed_dim);
       std::vector<int64_t> strides = calculate_strides(sizes, dim_order);
+      int64_t numel = utils::multiply_integers(sizes);
+
       std::vector<int64_t> ref_strides = get_reference_strides(sizes, layout);
       ASSERT_TRUE(strides == ref_strides);
 
-      int64_t numel = utils::multiply_integers(sizes);
       std::vector<int64_t> unsqueezed_strides =
-          unsqueeze_strides(strides, numel);
+          flip_and_unsqueeze<int64_t>(strides, kTensorStrides, numel);
+
       std::vector<int64_t> ref_unsqueezed_strides =
          get_reference_strides(sizes, layout, true);
       ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides);
 
+      std::vector<int64_t> whcn_dim_order =
+          flip_and_unsqueeze<int64_t>(dim_order, kTensorDimOrder, numel);
+
+      std::vector<int64_t> ref_whcn_dim_order =
+          create_whcn_dim_order(dim_order);
+
+      ASSERT_TRUE(whcn_dim_order == ref_whcn_dim_order);
+
       // Create new vTensor and check that the strides are correct
       vTensor new_v_tensor(
           context(),