diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index 90fea61318c..9333f34430e 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -538,8 +538,6 @@ def register_rotary_emb_op(features: OpFeatures): exir_ops.edge.aten.clone.default, exir_ops.edge.aten.permute.default, exir_ops.edge.aten.permute_copy.default, - exir_ops.edge.aten.select_copy.int, - exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.view_copy.default, ] ) @@ -551,6 +549,48 @@ def register_view_ops(features: OpFeatures): return features +# Fully featured transfer operators (i.e. operators that copy data from the input +# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations +# for both texture and buffer storage types. +@update_features(exir_ops.edge.aten.cat.default) +def register_cat_op(features: OpFeatures): + features.texture_impl = TextureImplFeatures( + valid_packed_dims=all_packed_dims, + ) + features.buffer_impl = True + features.resize_fn = True + + def check_cat_node(node: torch.fx.Node) -> bool: + inputs = node.args[0] + if isinstance(inputs, (list, tuple)) and len(inputs) <= 3: + return True + + return False + + features.check_node_fn = check_cat_node + + return features + + +# Fully featured transfer operators (i.e. operators that copy data from the input +# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations +# for both texture and buffer storage types. +@update_features( + [ + exir_ops.edge.aten.select_copy.int, + exir_ops.edge.aten.slice_copy.Tensor, + ] +) +def register_transfer_ops(features: OpFeatures): + features.texture_impl = TextureImplFeatures( + valid_packed_dims=all_packed_dims, + ) + features.buffer_impl = True + features.resize_fn = True + + return features + + # Ops ported from PyTorch Vulkan backend. These ops commonly support channels # packed tensors only and do not have a resize function. @update_features( @@ -588,7 +628,6 @@ def register_ported_op(features: OpFeatures): exir_ops.edge.aten.squeeze_copy.dims, exir_ops.edge.aten.unsqueeze_copy.default, # Tensor combination - exir_ops.edge.aten.cat.default, exir_ops.edge.aten.repeat.default, exir_ops.edge.aten.split_with_sizes_copy.default, exir_ops.edge.aten.split.Tensor, diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index a85229b2b86..43ebbfecbc6 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -143,6 +143,43 @@ bool dim_order_is_valid(const std::vector& dim_order) { return sum == n * (n + 1) / 2; } +/* + * Applies the following transformations to a tensor's dim_order vector: + * 1. Reverse the order of elements so that the fastest moving dimensions are + * first. + * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the + * width dimension, 1 represents the height dimension, and 2 represents the + * channels dimension. + * 3. Unsqueeze the dim_order vector to the next multiple of 4. + + * These transformations make it easier to use the dim order in a compute shader + */ +std::vector create_whcn_dim_order( + const std::vector& dim_order) { + size_t ndim = dim_order.size(); + std::vector whcn_order(ndim); + + // Convert from NCHW to WHCN index, and flip the dim order so that the fastest + // moving dimension is first. 
+ // example: { 1, 2, 0} -> { 2, 0, 1} + // {height, width, channels} -> {channels, width, height} + for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim; + ++whcn_i, --nchw_i) { + whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i); + } + + // Unsqueeze to the next multiple of 4 + size_t ndim_up4 = utils::align_up_4(ndim); + whcn_order.resize(ndim_up4); + + // Append unsqueezed dimensions + for (size_t i = ndim; i < ndim_up4; ++i) { + whcn_order.at(i) = i; + } + + return whcn_order; +} + std::vector unsqueeze_strides( const std::vector& strides, const int64_t numel) { @@ -212,6 +249,97 @@ utils::uvec3 calculate_image_extents( return extents; } +/* + * The physical image extents describe the size of an allocated texture resource + * i.e. how many texels in the width, height and depth axis of the image. + * However, the axis map allows a tensor logical dimension to map to a different + * physical texture axis; in essence, it describes a permutation between the + * logical width, height, channels, etc. dimensions of a tensor and the width, + * height, depth axis of a texture. + * + * The "logical extents" is simply the physical image extents permuted by the + * axis mapping. The logical extents is useful for constructing global work + * group sizes, so that it is easier to convert the global thread ID to a + * tensor index. + */ +utils::uvec3 calculate_logical_limits( + const utils::uvec3& image_extents, + const std::vector& axis_map) { + return { + image_extents[axis_map.at(0)], + image_extents[axis_map.at(1)], + image_extents[axis_map.at(2)], + }; +} + +/* + * Convenience overload of the above function to calculate logical limits + * directly from tensor sizes. + */ +utils::uvec3 calculate_logical_limits( + const std::vector& sizes, + const std::vector& axis_map, + const int32_t packed_dim) { + return calculate_logical_limits( + calculate_image_extents( + calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim), + axis_map); +} + +int64_t calculate_gpu_buffer_numel( + Context* const context, + const std::vector& sizes, + const utils::uvec3 image_extents, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype) { + // For texture backed tensors, simply multiply the total number of texels by 4 + if (storage_type != utils::kBuffer) { + return image_extents[0] * image_extents[1] * image_extents[2] * 4; + } + const bool is_int8 = dtype == vkapi::kChar; + const bool int8_supported = + context->adapter_ptr()->has_full_int8_buffers_support(); + const size_t numel = utils::multiply_integers(sizes); + // For int8 tensors, if the device does not support int8 buffers, then int32 + // is used instead to represent the buffer data. Therefore the number of + // elements in the buffer is aligned to the next multiple of 4. 
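[Aside, not part of the patch: a minimal standalone sketch of the buffer sizing rule that calculate_gpu_buffer_numel() implements. Note that the comment above and the removed staging_buffer_numel() both tie the align-up-to-4 padding to devices *without* full int8 buffer support, i.e. a guard of the form is_int8 && !int8_supported. The helper names below are illustrative stand-ins, not the backend's utils API.]

#include <array>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Illustrative stand-in for utils::align_up_4.
int64_t align_up_4(int64_t n) { return (n + 3) & ~int64_t(3); }

// Sketch of the sizing rule: texture storage counts texels * 4; buffer storage
// counts elements, padded to a multiple of 4 only when the tensor is int8 and
// the device lacks full int8 buffer support.
int64_t gpu_buffer_numel_sketch(
    const std::vector<int64_t>& sizes,
    const std::array<uint32_t, 3>& image_extents,
    bool is_buffer_storage,
    bool is_int8,
    bool int8_supported) {
  if (!is_buffer_storage) {
    return int64_t(image_extents[0]) * image_extents[1] * image_extents[2] * 4;
  }
  const int64_t numel = std::accumulate(
      sizes.begin(), sizes.end(), int64_t(1), std::multiplies<int64_t>());
  if (is_int8 && !int8_supported) {
    return align_up_4(numel);
  }
  return numel;
}

int main() {
  // int8 tensor of shape (2, 3, 5) = 30 elements, no int8 buffer support:
  // the backing buffer is sized to 32 elements.
  std::cout << gpu_buffer_numel_sketch({2, 3, 5}, {0, 0, 0}, true, true, false)
            << "\n"; // 32
}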
+ if (is_int8 && int8_supported) { + return utils::align_up_4(numel); + } + return numel; +} + +int32_t pack_into_int32(const std::vector& vec, const int32_t extra) { + int32_t packed = static_cast( + vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) + + (extra << 16)); + return packed; +} + +int32_t create_hashed_layout( + const std::vector& dim_order, + const std::vector& axis_map, + const int32_t packed_dim, + const utils::StorageType storage_type) { + if (storage_type == utils::kBuffer) { + return pack_into_int32(create_whcn_dim_order(dim_order), 0); + } + return pack_into_int32(axis_map, packed_dim); +} + +size_t calculate_max_ubo_nbytes( + const size_t nbytes_per_ubo, + const utils::StorageType storage_type) { + // For texture backed tensors, the metadata fields needed are: + // sizes, logical limits + size_t max_metadata_field_count = 2u; + if (storage_type == utils::kBuffer) { + // sizes, strides, dim order, numel + max_metadata_field_count = 4u; + } + return max_metadata_field_count * nbytes_per_ubo; +} + // // vTensorStorage // @@ -322,14 +450,21 @@ vTensorStorage::vTensorStorage( const utils::StorageType storage_type, const std::vector& axis_map, const int32_t packed_dim, - const std::vector& padded_sizes, + const std::vector& sizes, const vkapi::ScalarType dtype, const bool allocate_memory) : context_(context), storage_type_{storage_type}, - image_extents_( - calculate_image_extents(padded_sizes, axis_map, packed_dim)), - buffer_length_{utils::multiply_integers(padded_sizes)}, + image_extents_(calculate_image_extents( + calculate_padded_sizes(sizes, packed_dim), + axis_map, + packed_dim)), + buffer_length_{calculate_gpu_buffer_numel( + context_, + sizes, + image_extents_, + storage_type, + dtype)}, buffer_offset_{0}, image_(allocate_image( context_, @@ -446,35 +581,45 @@ vTensor::vTensor( dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(calculate_strides(sizes, dim_order_)), - padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, - unsqueezed_strides_{ - unsqueeze_strides(strides_, utils::multiply_integers(sizes_))}, - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(utils::multiply_integers(sizes_)), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + storage_type)), + // Related to tensor metadata UBOs + nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{calculate_max_ubo_nbytes(nbytes_per_ubo_, storage_type)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared( context, storage_type, axis_map_, packed_dim_, - padded_sizes_, + sizes, dtype_, allocate_memory)) { + // Derived metadata + std::vector whcn_dim_order(4, 0); + std::vector unsqueezed_strides(4, 0); + // Only calculate derived metadata if needed for the desired storage type. + // Note that logical limits may be used by buffer storage as well in order to + // set global work group sizes for some compute shaders. 
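[For reference, a standalone sketch of the nibble packing performed by pack_into_int32()/create_hashed_layout() above, and of the unpacking that the unhash_* macros in indexing_utils.h perform in shaders. The printed values line up with the DEFAULT_DIM_ORDER (0x03210) and DEFAULT_LAYOUT (0x02210) constants defined later in this diff; the function names are illustrative.]

#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch of pack_into_int32(): each of the 4 vector elements occupies 4 bits,
// with an extra value (the packed dim for textures, 0 for buffers) in bits 16+.
int32_t pack_into_int32_sketch(const std::vector<int64_t>& vec, int32_t extra) {
  return static_cast<int32_t>(
      vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) +
      (extra << 16));
}

// Sketch of what the GLSL unhash_dim_order / unhash_axis_map macros compute.
void unpack_nibbles(int32_t hash, int32_t out[4]) {
  for (int i = 0; i < 4; ++i) {
    out[i] = (hash >> (4 * i)) & 0xf;
  }
}

int main() {
  // Contiguous buffer tensor: WHCN dim order {0, 1, 2, 3}, extra = 0.
  const int32_t hashed = pack_into_int32_sketch({0, 1, 2, 3}, 0);
  std::printf("0x%05x\n", static_cast<unsigned>(hashed)); // 0x03210

  // Default texture layout: axis map {0, 1, 2, 2} with packed_dim = 0.
  std::printf(
      "0x%05x\n",
      static_cast<unsigned>(pack_into_int32_sketch({0, 1, 2, 2}, 0))); // 0x02210

  int32_t dims[4];
  unpack_nibbles(hashed, dims);
  std::printf("%d %d %d %d\n", dims[0], dims[1], dims[2], dims[3]); // 0 1 2 3
}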
+ if (storage_type == utils::kBuffer) { + whcn_dim_order = create_whcn_dim_order(dim_order_); + unsqueezed_strides = unsqueeze_strides(strides_, numel_); + } + uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); + whcn_dim_order, + unsqueezed_strides, + TextureLimits( + calculate_logical_limits(storage_->image_extents_, axis_map_)), + numel_}); VK_CHECK_COND( dim_order_is_valid(dim_order_), "computed dim order is invalid"); - - set_logical_limits(storage_->image_extents_); } // NOLINTNEXTLINE @@ -490,24 +635,23 @@ vTensor::vTensor( dim_order_(), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(), - padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_)), - unsqueezed_strides_(), - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(utils::multiply_integers(sizes_)), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + utils::kTexture3D)), + // Related to tensor metadata UBOs + nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{ + calculate_max_ubo_nbytes(nbytes_per_ubo_, utils::kTexture3D)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared(context, image)) { - uniform_data_ = std::make_shared(UniformData{ - sizes_, - {0, 0, 0, 0}, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); - set_logical_limits(storage_->image_extents_); + TextureLimits logical_limits( + calculate_logical_limits(storage_->image_extents_, axis_map_)); + uniform_data_ = std::make_shared( + UniformData{sizes_, {0, 0, 0, 0}, {0, 0, 0, 0}, logical_limits, numel_}); } vTensor::vTensor(vTensor& other) @@ -518,18 +662,11 @@ vTensor::vTensor(vTensor& other) dim_order_(other.dim_order_.begin(), other.dim_order_.end()), axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), - padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, - unsqueezed_strides_{ - other.unsqueezed_strides_.begin(), - other.unsqueezed_strides_.end()}, - padded_numel_(other.padded_numel_), + numel_(other.numel_), + hashed_layout_(other.hashed_layout_), + nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(*other.get_uniform_data()); @@ -546,22 +683,21 @@ vTensor::vTensor( dim_order_(dim_order.begin(), dim_order.end()), axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), strides_(calculate_strides(sizes_, dim_order_)), - padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, - unsqueezed_strides_{ - unsqueeze_strides(strides_, utils::multiply_integers(sizes_))}, - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(other.numel_), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + other.storage_type())), + 
nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, + create_whcn_dim_order(dim_order_), + unsqueeze_strides(strides_, numel_), {other.logical_limits()}, static_cast(utils::multiply_integers(sizes_))}); @@ -584,6 +720,7 @@ uint32_t vTensor::UniformData::write_attribute( } switch (attr) { WRITE_ATTRIBUTE_CASE(SIZES, sizes_v); + WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, whcn_dim_order_v); WRITE_ATTRIBUTE_CASE(STRIDES, strides_v); WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits); WRITE_ATTRIBUTE_CASE(NUMEL, numel); @@ -624,12 +761,6 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_->buffer_; } -void vTensor::set_logical_limits(const utils::uvec3& image_extents) { - uniform_data_->logical_limits.limits[0] = image_extents[axis_map_.at(0)]; - uniform_data_->logical_limits.limits[1] = image_extents[axis_map_.at(1)]; - uniform_data_->logical_limits.limits[2] = image_extents[axis_map_.at(2)]; -} - utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { switch (packed_dim_) { case WHCN::kWidthDim: @@ -643,95 +774,108 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { } } +bool vTensor::is_contiguous() const { + if (storage_type() != utils::kBuffer) { + return false; + } + for (size_t i = 0; i < dim_order_.size(); ++i) { + if (dim_order_.at(i) != i) { + return false; + } + } + return true; +} + +size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const { + // For texture backed tensors, the metadata fields needed are: + // sizes, logical limits + size_t max_metadata_field_count = 2u; + if (storage_type() == utils::kBuffer) { + // sizes, strides, dim order, numel + max_metadata_field_count = 4u; + } + return max_metadata_field_count * nbytes_per_ubo; +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (sizes_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); sizes_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), sizes_uniform_offset_, size_per_ubo); + uniforms_.buffer(), sizes_uniform_offset_, nbytes_per_ubo_); } -const vkapi::BufferBindInfo vTensor::strides_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; +const vkapi::BufferBindInfo vTensor::dim_order_ubo() { if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } - if 
(unsqueezed_strides_offset_ == kUniformOffsetUnset) { + if (dim_order_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); - unsqueezed_strides_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + dim_order_uniform_offset_ = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update( - utils::make_whcn_ivec4(unsqueezed_strides_), - unsqueezed_strides_offset_); + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + return vkapi::BufferBindInfo( + uniforms_.buffer(), dim_order_uniform_offset_, nbytes_per_ubo_); +} + +const vkapi::BufferBindInfo vTensor::strides_ubo() { + if (!uniforms_.buffer()) { + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); + } + if (strides_uniform_offset == kUniformOffsetUnset) { + VK_CHECK_COND( + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, + "Uniform data allocation has exceeded Tensor uniform buffer size"); + strides_uniform_offset = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } return vkapi::BufferBindInfo( - uniforms_.buffer(), unsqueezed_strides_offset_, size_per_ubo); + uniforms_.buffer(), strides_uniform_offset, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (logical_limits_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); logical_limits_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(logical_limits(), logical_limits_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), logical_limits_uniform_offset_, size_per_ubo); + uniforms_.buffer(), logical_limits_uniform_offset_, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::numel_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (numel_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); numel_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(numel(), numel_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), numel_uniform_offset_, size_per_ubo); -} - -size_t vTensor::staging_buffer_numel() const { - const bool is_int8 = dtype_ == vkapi::kChar; - const bool int8_supported = - storage_->context_->adapter_ptr()->has_full_int8_buffers_support(); - if (is_int8 && !int8_supported) { - return utils::align_up_4(numel()); - } - 
if (storage_type() == utils::kBuffer) { - return numel(); - } - return padded_numel_; + uniforms_.buffer(), numel_uniform_offset_, nbytes_per_ubo_); } VkMemoryRequirements vTensor::get_memory_requirements() const { @@ -758,33 +902,36 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } void vTensor::update_metadata() { + numel_ = utils::multiply_integers(sizes_); strides_ = calculate_strides(sizes_, dim_order_); - uniform_data_->numel = utils::multiply_integers(sizes_); - - padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_); - unsqueezed_strides_ = unsqueeze_strides(strides_, numel()); - padded_numel_ = utils::multiply_integers(padded_sizes_); // Update uniform data if it has been modified + uniform_data_->numel = numel_; uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_); - uniform_data_->strides_v = utils::make_whcn_ivec4(unsqueezed_strides_); - - // Calculate the image extents that would have been used to allocate a texture - // withthe current sizes, and use that to set the logical limits. - set_logical_limits( - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_)); + uniform_data_->whcn_dim_order_v = + utils::make_ivec4(create_whcn_dim_order(dim_order_)); + uniform_data_->strides_v = + utils::make_whcn_ivec4(unsqueeze_strides(strides_, numel_)); + uniform_data_->numel = utils::safe_downcast(numel_); + uniform_data_->logical_limits.limits = + calculate_logical_limits(sizes_, axis_map_, packed_dim_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); } - if (unsqueezed_strides_offset_ != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->strides_v, unsqueezed_strides_offset_); + if (dim_order_uniform_offset_ != kUniformOffsetUnset) { + uniforms_.update( + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + if (strides_uniform_offset != kUniformOffsetUnset) { + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } if (numel_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(numel(), numel_uniform_offset_); + uniforms_.update(numel_, numel_uniform_offset_); } if (logical_limits_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(logical_limits(), logical_limits_uniform_offset_); + uniforms_.update( + uniform_data_->logical_limits.limits, logical_limits_uniform_offset_); } } @@ -792,8 +939,8 @@ void vTensor::check_sizes(const std::vector& sizes) const { if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
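[For context on how the *_ubo() accessors above share a single uniform buffer, a minimal sketch of the lazy slice-allocation pattern: the buffer of max_ubo_nbytes is created on first use, each metadata field claims an nbytes_per_ubo slice exactly once, and update_metadata() later rewrites whichever slices were handed out. A std::vector stands in for ParamsBuffer and the sizes are illustrative.]

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

// Initial value of uniform buffer offsets; matches kUniformOffsetUnset in the
// patch (1 is essentially impossible as a real UBO offset).
constexpr uint32_t kUniformOffsetUnsetSketch = 1;

struct MetadataUboSketch {
  uint32_t nbytes_per_ubo;      // min UBO alignment reported by the device
  uint32_t max_ubo_nbytes;      // metadata field count * nbytes_per_ubo
  uint32_t used_nbytes;
  std::vector<uint8_t> storage; // stand-in for the GPU-visible ParamsBuffer

  // Allocate the backing buffer lazily and hand out a slice the first time a
  // given metadata field is requested; subsequent calls return the same offset.
  uint32_t claim_slice(uint32_t& offset) {
    if (storage.empty()) {
      storage.resize(max_ubo_nbytes);
    }
    if (offset == kUniformOffsetUnsetSketch) {
      if (used_nbytes + nbytes_per_ubo > max_ubo_nbytes) {
        throw std::runtime_error("exceeded tensor uniform buffer size");
      }
      offset = used_nbytes;
      used_nbytes += nbytes_per_ubo;
    }
    return offset;
  }

  // Rewriting a slice in place, as update_metadata() does after a resize.
  void update_slice(uint32_t offset, const void* data, size_t nbytes) {
    std::memcpy(storage.data() + offset, data, nbytes);
  }
};

int main() {
  MetadataUboSketch ubos{/*nbytes_per_ubo=*/256, /*max_ubo_nbytes=*/4 * 256, 0, {}};
  uint32_t sizes_offset = kUniformOffsetUnsetSketch;
  uint32_t strides_offset = kUniformOffsetUnsetSketch;
  const int32_t sizes_whcn[4] = {5, 4, 3, 2};
  ubos.update_slice(ubos.claim_slice(sizes_offset), sizes_whcn, sizeof(sizes_whcn));
  ubos.claim_slice(strides_offset); // second field gets offset 256
  return (sizes_offset == 0 && strides_offset == 256) ? 0 : 1;
}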
- utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_); + utils::uvec3 virtual_extents = calculate_image_extents( + calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -828,6 +975,11 @@ void vTensor::virtual_reconfigure( check_sizes(new_sizes); sizes_ = new_sizes; dim_order_ = new_dim_order; + + // Update the hashed layout because dim order is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } @@ -837,6 +989,7 @@ void vTensor::virtual_clone(const vTensor& other) { dim_order_ = other.dim_order_; axis_map_ = other.axis_map_; packed_dim_ = other.packed_dim_; + hashed_layout_ = other.hashed_layout_; *uniform_data_ = *other.get_uniform_data(); } @@ -895,6 +1048,11 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { axis_map_.at(3) = dim0_whcn; } } + + // Update the hashed layout because dim order / axis mpa is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 850dc2d7fab..78a24d87e77 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -81,6 +81,18 @@ struct LastAccess { : stage{stage_flags}, access{access_flags} {} }; +/* + * Calculate the number of elements that a GPU buffer would require to store the + * contents of a tensor. This will depend on the storage type and dtype of the + * tensor, as well as the features available on the device. + */ +int64_t calculate_gpu_buffer_numel( + Context* const context, + const std::vector& sizes, + const utils::uvec3 image_extents, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype); + class vTensorStorage final { public: // Do not allow empty vTensorStorage construction @@ -91,7 +103,7 @@ class vTensorStorage final { const utils::StorageType storage_type, const std::vector& axis_map, const int32_t packed_dim, - const std::vector& padded_sizes, + const std::vector& sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -140,6 +152,10 @@ class vTensorStorage final { void verify() const; public: + inline size_t buffer_len() const { + return utils::safe_downcast(buffer_length_); + } + inline VkFormat texture_format() { return image_.format(); } @@ -207,8 +223,11 @@ class vTensor final { vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; + ~vTensor() = default; + enum class Attribute : uint8_t { SIZES, + WHCN_DIM_ORDER, STRIDES, LOGICAL_LIMITS, NUMEL, @@ -216,6 +235,7 @@ class vTensor final { class UniformData { utils::ivec4 sizes_v; + utils::ivec4 whcn_dim_order_v; utils::ivec4 strides_v; // See the comments documenting logical_limits() for more context. 
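[A small sketch, for reference, of what calculate_logical_limits() computes for the logical_limits member below: the physical image extents permuted through the axis map, so that each component bounds a logical WHCN dimension rather than a raw texture axis. The axis maps used here ({0, 1, 2, 2} and a swapped {1, 0, 2, 2}) are illustrative.]

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Sketch of calculate_logical_limits(): permute the image extents by the
// first three entries of the axis map.
std::array<uint32_t, 3> logical_limits_sketch(
    const std::array<uint32_t, 3>& image_extents,
    const std::vector<int64_t>& axis_map) {
  return {{
      image_extents[axis_map.at(0)],
      image_extents[axis_map.at(1)],
      image_extents[axis_map.at(2)]}};
}

int main() {
  // With the default axis map {0, 1, 2, 2} the limits equal the extents.
  auto limits = logical_limits_sketch({8, 6, 3}, {0, 1, 2, 2});
  std::cout << limits[0] << " " << limits[1] << " " << limits[2] << "\n"; // 8 6 3

  // A swapped axis map {1, 0, 2, 2} swaps which texture axis bounds the
  // logical width vs. height.
  limits = logical_limits_sketch({8, 6, 3}, {1, 0, 2, 2});
  std::cout << limits[0] << " " << limits[1] << " " << limits[2] << "\n"; // 6 8 3
}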
TextureLimits logical_limits; @@ -227,10 +247,12 @@ class vTensor final { UniformData( const std::vector& sizes, + const std::vector& whcn_dim_order, const std::vector& strides, const TextureLimits& logical_limits, const size_t numel_ll) : sizes_v(utils::make_whcn_ivec4(sizes)), + whcn_dim_order_v(utils::make_ivec4(whcn_dim_order)), strides_v(utils::make_whcn_ivec4(strides)), logical_limits(logical_limits), numel(utils::safe_downcast(numel_ll)) {} @@ -293,21 +315,17 @@ class vTensor final { // strides of the tensor in NCHW dimension order std::vector strides_; - /* - * The below metadata members are derived from the above, and are typically - * to i.e. pass tensor metadata to compute shaders. - */ + // number of elements based on the canonical sizes + size_t numel_; + + // For texture backed tensors, this int32 contains the axis map data packed + // into a single int32. For buffer backed tensors, this int32 contains the + // wchn dim order data packed into a single int32. + int32_t hashed_layout_; - // padded sizes of the tensor in NCHW dimension order. See the - // calculate_padded_sizes() function for more context. Note that padded sizes - // are only used for texture storage, and not for buffer storage. - std::vector padded_sizes_; - // Contains the strides of the tensor, with the dimensionality padded to the - // nearest multiple of 4. Unsqueezed dims will have a stride of int32_t max. - std::vector unsqueezed_strides_; - // Contains the number of elements in the tensor according to the padded - // sizes. - size_t padded_numel_; + // Pre-compute these quantities to avoid frequent re-computation + size_t nbytes_per_ubo_; + size_t max_ubo_nbytes_; /* * Utility GPU buffer that can be passed to shaders in order to convey tensor @@ -320,15 +338,13 @@ class vTensor final { * context about the data contained in each buffer. */ ParamsBuffer uniforms_; - uint32_t uniforms_size_; - uint32_t sizes_uniform_offset_; - uint32_t unsqueezed_strides_offset_; - uint32_t numel_uniform_offset_; - uint32_t logical_limits_uniform_offset_; - // Maximum number of metadata fields that can be stored in the metadata UBO. - // This is used to calculate the size of the UBO that should be allocated. - constexpr static size_t kMaxMetadataFieldCount = 4; + uint32_t uniforms_size_ = 0u; + uint32_t sizes_uniform_offset_ = kUniformOffsetUnset; + uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset; + uint32_t strides_uniform_offset = kUniformOffsetUnset; + uint32_t numel_uniform_offset_ = kUniformOffsetUnset; + uint32_t logical_limits_uniform_offset_ = kUniformOffsetUnset; // Initial value of uniform buffer offsets. 1 is selected as it is essentially // impossible for a ubo to have an offset of 1. @@ -381,9 +397,6 @@ class vTensor final { return storage_->storage_type_ == utils::kBuffer; } - private: - void set_logical_limits(const utils::uvec3& image_extents); - public: /* * The logical limits of the tensor are derived from the image extents of the @@ -451,21 +464,37 @@ class vTensor final { return dim_order_; } + inline const std::vector& strides() const { + return strides_; + } + + inline size_t numel() const { + return numel_; + } + + inline size_t nbytes() const { + return element_size(dtype()) * numel(); + } + inline const std::vector& axis_map() const { return axis_map_; } /* - * Returns a single int32_t that contains the values of the axis map and the - * packed dimension packed into a single int32_t, such that it can be used as - * a specialization constant in a compute shader. 
This allows for the SPIR-V - * to bytecode compilation to perform compile-time unfolding on the axis map. - * Each element of the axis map and the value of the packed dimension take up - * 4 bits in the packed int32_t. + * For texture backed tensors, this function return a int32_t that contains + * the axis map + packed dimension. Each element of the axis map occupies 4 + * bits of the int32. + * + * For buffer backed tensors, the int32_t contains the WHCN dim order, where + * each element of the dim order array occupies 4 bits of the int32. + * + * This int32 is typically consumed as a specialization constant in compute + * shaders where it is subsequently unpacked. The layout data of a vTensor + * instance is typically static once created, which is why this method is + * appropriate. */ inline int32_t hashed_layout() const { - return axis_map_.at(0) + (axis_map_.at(1) << 4) + (axis_map_.at(2) << 8) + - (axis_map_.at(3) << 12) + (packed_dim_ << 16); + return hashed_layout_; } /* @@ -478,57 +507,48 @@ class vTensor final { return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2; } - inline const std::vector& strides() const { - return strides_; - } + /* + * Return true if a buffer backed tensor's dim order matches that of a + * contiguous tensor, i.e. the dim order will be {0, 1, 2, ... }. + * Returns false for texture backed tensors. + */ + bool is_contiguous() const; - inline const std::vector& unsqueezed_strides() const { - return unsqueezed_strides_; + private: + inline size_t nbytes_per_ubo() const { + return storage_->context_->adapter_ptr()->min_ubo_alignment(); } + size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const; + + public: /* - * Returns a GPU buffer containing the sizes of the tensor in WHCN order. - * Note that dimensions that are not present in the tensor's sizes are set to - * a size of 1. + * The functions below return the buffer binding info for a UBO that contains + * some metadata of the tensor, which can be used to pass in tensor metadata + * to a compute shader. The other method of passing in tensor metadata is via + * push constants. The trade-off between each is that push constants may be + * slightly more performant and memory efficient; however, to update the + * values in a push constant due to i.e. a tensor resize between inferences, + * the command buffer must be re-encoded. On the other hand, UBOs can update + * their data by writing to their mapped memory without requiring a command + * buffer re-encode. */ + const vkapi::BufferBindInfo sizes_ubo(); - /* - * Returns a GPU buffer containing the strides of the tensor in WHCN order. - * Note that the strides are extended to a dimensionality that is a multiple - * of 4, thus dimensions that are not present in the tensor's sizes are set to - * have a stride equal to the stride of the "slowest moving" dimension. - */ + const vkapi::BufferBindInfo dim_order_ubo(); + const vkapi::BufferBindInfo strides_ubo(); - /* - * Returns a GPU buffer containing the logical limits of the tensor. See the - * comments for logical_limits() for more context. - */ const vkapi::BufferBindInfo logical_limits_ubo(); - /* - * Returns the number of elements in the buffer used to store the tensor. 
- */ const vkapi::BufferBindInfo numel_ubo(); - inline size_t numel() const { - return uniform_data_->numel; - } - - inline size_t nbytes() const { - return element_size(dtype()) * numel(); - } - - /* - * Returns numel but based on padded_sizes_ instead of sizes_ - */ - inline size_t padded_numel() const { - return padded_numel_; + public: + inline size_t staging_buffer_numel() const { + return storage_->buffer_len(); } - size_t staging_buffer_numel() const; - inline size_t staging_buffer_nbytes() const { return element_size(dtype()) * staging_buffer_numel(); } @@ -608,6 +628,8 @@ class vTensor final { }; static constexpr vTensor::Attribute kTensorSizes = vTensor::Attribute::SIZES; +static constexpr vTensor::Attribute kTensorDimOrder = + vTensor::Attribute::WHCN_DIM_ORDER; static constexpr vTensor::Attribute kTensorStrides = vTensor::Attribute::STRIDES; static constexpr vTensor::Attribute kTensorLogicalLimits = diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 31514989dfc..21d80d5843f 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -346,6 +346,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().strides_ubo(); } + inline vkapi::BufferBindInfo dim_order_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().dim_order_ubo(); + } + inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) { return values_.at(idx).toTensor().numel_ubo(); } @@ -354,6 +358,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().has_standard_axis_map(); } + inline bool is_contiguous(const ValueRef idx) const { + return values_.at(idx).toTensor().is_contiguous(); + } + inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { return values_.at(idx).toTensor().logical_limits_ubo(); } @@ -363,6 +371,12 @@ class ComputeGraph final { values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorSizes); } + inline PushConstantDataInfo dim_order_pc_of(const ValueRef idx) const { + return PushConstantDataInfo( + values_.at(idx).toConstTensor().get_uniform_data(), + api::kTensorDimOrder); + } + inline PushConstantDataInfo strides_pc_of(const ValueRef idx) const { return PushConstantDataInfo( values_.at(idx).toConstTensor().get_uniform_data(), diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index ce986d4e12f..a0a235154a0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -48,19 +48,18 @@ $else: layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} + $if STORAGE == "buffer": - ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} + const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); $else: - ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); const lowp int packed_dim = unhash_packed_dim(out_layout); - ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = 
unhash_axis_map(in_layout); - ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); #ifdef USING_BUFFER @@ -77,7 +76,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); const ivec4 in_tidx = min(out_tidx, in_sizes - 1); const ivec4 other_tidx = min(out_tidx, other_sizes - 1); diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl new file mode 100644 index 00000000000..895cecb413a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl @@ -0,0 +1,69 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("buffer")} +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} + +$for i in range(NUM_INPUTS): + ${layout_declare_tensor(B, "r", "t_in" + str(i + 1), DTYPE, "buffer")} + +${layout_declare_ubo(B, "int", "concat_dim")} + +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_strides")} + +$for i in range(NUM_INPUTS): + ${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_sizes")} + ${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_strides")} + +${layout_declare_ubo(B, "int", "out_numel")} + +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const int out_bufi = ivec3(gl_GlobalInvocationID).x; + if (out_bufi >= out_numel) { + return; + } + + // Convert buffer linear index to 4-D tensor index for output + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); + + // Determine which input tensor to read from + ivec4 in_tidx = out_tidx; + + $for i in range(NUM_INPUTS): + // Check if the index at the concat dim is within bounds of the input tensor + // If so, read from that input tensor and write to output + if (in_tidx[concat_dim] < in${i+1}_sizes[concat_dim]) { + int in_bufi = tidx_to_bufi(in_tidx, in${i+1}_strides); + t_out[out_bufi] = t_in${i+1}[in_bufi]; + return; + } + // otherwise, decrement the index at the concat dim + else { + in_tidx[concat_dim] -= in${i+1}_sizes[concat_dim]; + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml new file mode 100644 index 00000000000..39f96df5e90 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml @@ -0,0 +1,14 @@ +concat_buffer: + parameter_names_with_default_values: + DTYPE: float + NUM_INPUTS: 2 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: concat_1_buffer + NUM_INPUTS: 1 + - NAME: concat_2_buffer + - NAME: concat_3_buffer + NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl new file mode 100644 index 
00000000000..dac6266bf67 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl @@ -0,0 +1,129 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} + +#define USING_TEXTURE3D + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} + +$for i in range(NUM_INPUTS): + ${layout_declare_tensor(B, "r", "t_in" + str(i + 1), DTYPE, "texture3d")} + +${layout_declare_ubo(B, "int", "concat_dim")} + +$in_metadata = "" +$for i in range(NUM_INPUTS): + $in_metadata += "ivec4 in" + str(i + 1) + "_sizes;\n" + +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ${in_metadata} +}; + +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int out_packed_dim = unhash_packed_dim(out_layout); + +$for i in range(NUM_INPUTS): + ${layout_declare_spec_const(C, "int", "in" + str(i+1) + "_layout", "DEFAULT_LAYOUT")} + const lowp ivec4 in${i+1}_axis_map = unhash_axis_map(in${i+1}_layout); + const lowp int in${i+1}_packed_dim = unhash_packed_dim(in${i+1}_layout); + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Check if we can use the fast path (no texel merging required) +bool can_use_fast_path() { + // Fast path is possible when: + // 1. The concat dimension is not the packed dimension, or + // 2. The concat dimension is the packed dimension but both input tensors have dimensions + // that are multiples of 4 along the packed dimension + if (concat_dim != out_packed_dim) { + return true; + } + + // Check if all input tensors have dimensions that are multiples of 4 along the packed dimension + bool all_concat_dim_size_multiple_of_4 = true; + $for i in range(NUM_INPUTS): + all_concat_dim_size_multiple_of_4 = + all_concat_dim_size_multiple_of_4 && + (in${i+1}_sizes[concat_dim] % 4 == 0); + + return all_concat_dim_size_multiple_of_4; +} + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + + if (any(greaterThanEqual(out_tidx, out_sizes))) { + return; + } + + if (can_use_fast_path()) { + // Fast path: No texel merging required + ivec4 in_tidx = out_tidx; + + $for i in range(NUM_INPUTS): + // For each input tensor, check if the tensor index is within bounds. If + // so, read the texel from the input tensor and write it to the output + if (in_tidx[concat_dim] < in${i+1}_sizes[concat_dim]) { + const ivec3 in_pos = tidx_to_pos(in_tidx, in${i+1}_sizes, in${i+1}_axis_map, in${i+1}_packed_dim); + const VEC4_T in_texel = load_texel(t_in${i+1}, in_pos); + write_texel_lpos(t_out, lpos, in_texel, out_axis_map); + return; + } + // Otherwise, adjust the index along the concat dimension and try the next + // input tensor. 
+ else { + in_tidx[concat_dim] -= in${i+1}_sizes[concat_dim]; + } + } + else { + // Slow path: Texel merging required + VEC4_T out_texel = VEC4_T(0); + + // Process each element in the output texel individually + for (int texel_i = 0; texel_i < 4; ++texel_i) { + ivec4 curr_out_tidx = out_tidx; + curr_out_tidx[out_packed_dim] += texel_i; + + // Skip if we're out of bounds + if (curr_out_tidx[out_packed_dim] >= out_sizes[out_packed_dim]) { + continue; + } + + ivec4 in_tidx = curr_out_tidx; + $for i in range(NUM_INPUTS): + // For each input tensor, check if the tensor index is within bounds. If + // so, read the corresponding texel element from the input tensor and + // write it to the output texel. + if (in_tidx[concat_dim] < in${i+1}_sizes[concat_dim]) { + const ivec4 in_posi = tidx_to_posi(in_tidx, in${i+1}_sizes, in${i+1}_axis_map, in${i+1}_packed_dim); + out_texel[texel_i] = load_texel(t_in${i+1}, in_posi.xyz)[in_posi.w]; + continue; + } + // Otherwise, adjust the index along the concat dimension and try the + // next input tensor. + else { + in_tidx[concat_dim] -= in${i+1}_sizes[concat_dim]; + } + } + + write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml new file mode 100644 index 00000000000..ed5003382a1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml @@ -0,0 +1,14 @@ +concat_texture: + parameter_names_with_default_values: + DTYPE: float + NUM_INPUTS: 2 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: concat_1_texture3d + NUM_INPUTS: 1 + - NAME: concat_2_texture3d + - NAME: concat_3_texture3d + NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 2b41d2b7e1a..0cfd7f2f119 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -68,21 +68,6 @@ */ #define mod4(x) ((x) & 3) -/* - * Find the packed dimension of a tensor given its strides. The packed dimension - * is the "fastest moving" dimension which will have a stride of 1. - */ -int find_packed_dim(const ivec4 strides) { - int packed_dim = 0; - for (int i = 0; i <= 3; i++) { - if (strides[i] == 1) { - packed_dim = i; - break; - } - } - return packed_dim; -} - /* * Get the staging buffer indices that contain the data of the texel that * corresponds to the provided tensor index. Since the texel have 4 elements, @@ -129,27 +114,26 @@ int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { tidx.x; } -// TODO(ssjia): make this function use dim order so that it can work with any -// dim order. Currently it assumes that the dim order is contiguous, except for -// the packed dim. -ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) { +ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const ivec4 dim_order) { ivec4 idx; for (int i = 3; i >= 0; i--) { - if (i != packed_dim) { - idx[i] = bufi / strides[i]; - bufi %= strides[i]; - } + int dim = dim_order[i]; + idx[dim] = bufi / strides[dim]; + bufi %= strides[dim]; } - idx[packed_dim] = bufi; return idx; } -// Convenience overload of the above function, which will determine the packed -// dim from the strides automatically so it doesn't have to be passed in as a -// function argument. 
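[To make the dim-order-aware conversion above concrete, a C++ sketch of the new bufi_to_tidx() with a worked example for a contiguous NCHW tensor of sizes N=2, C=3, H=4, W=5; strides and dim order are expressed in WHCN order, as the shaders expect.]

#include <array>
#include <cstdio>

// C++ sketch of the GLSL bufi_to_tidx(): walk dims from slowest to fastest
// moving (dim_order stores the fastest-moving WHCN dim first) and peel off
// the contribution of each stride.
std::array<int, 4> bufi_to_tidx_sketch(
    int bufi,
    const std::array<int, 4>& strides,     // WHCN-order strides
    const std::array<int, 4>& dim_order) { // fastest-moving WHCN dim first
  std::array<int, 4> tidx{};
  for (int i = 3; i >= 0; --i) {
    const int dim = dim_order[i];
    tidx[dim] = bufi / strides[dim];
    bufi %= strides[dim];
  }
  return tidx;
}

int main() {
  // Contiguous NCHW tensor with sizes N=2, C=3, H=4, W=5.
  // WHCN strides: W=1, H=5, C=20, N=60; WHCN dim order: {0, 1, 2, 3}.
  const auto tidx = bufi_to_tidx_sketch(37, {1, 5, 20, 60}, {0, 1, 2, 3});
  // 37 = 0*60 + 1*20 + 3*5 + 2*1  ->  (w, h, c, n) = (2, 3, 1, 0)
  std::printf("%d %d %d %d\n", tidx[0], tidx[1], tidx[2], tidx[3]);
}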
-ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) { - int packed_dim = find_packed_dim(strides); - return bufi_to_tidx(bufi, strides, packed_dim); +/* + * bufi_to_tidx but assumes that the tensor is contiguous + */ +ivec4 contiguous_bufi_to_tidx(int bufi, const ivec4 strides) { + ivec4 idx; + for (int i = 3; i >= 0; i--) { + idx[i] = bufi / strides[i]; + bufi %= strides[i]; + } + return idx; } int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { @@ -269,12 +253,22 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { * e.g. 0x11021, 1 -> ivec4(1, 2, 0, 1) */ #define unhash_axis_map(hash) \ - ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf)) + (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) + +/* + * + */ +#define unhash_dim_order(hash) \ + (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) #define unhash_packed_dim(hash) int(hash >> 16 & 0xf) #define DEFAULT_LAYOUT 0x02210 +#define DEFAULT_DIM_ORDER 0x03210 + +#define DEFAULT_DIM_ORDER_IVEC4 ivec4(0, 1, 2, 3) + /************************ * Deprecated Functions * ************************/ diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl index dfb5f1f2f9c..4dd83f0d4ed 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl @@ -62,7 +62,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); + const ivec4 out_tidx = contiguous_bufi_to_tidx(out_bufi, out_strides); const FLOAT_T scale = t_scales[out_tidx.x]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl index ba4e4dd9dd9..62cd0610ffb 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -10,8 +10,8 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "nchw_in", DTYPE, STORAGE)} $if USE_PUSH_CONST: layout(push_constant) uniform restrict Block { @@ -20,15 +20,14 @@ $if USE_PUSH_CONST: int numel; }; $else: - ${layout_declare_ubo(2, "ivec4", "out_sizes")} - ${layout_declare_ubo(3, "ivec4", "out_strides")} - ${layout_declare_ubo(4, "int", "numel")} + ${layout_declare_ubo(B, "ivec4", "out_sizes")} + ${layout_declare_ubo(B, "ivec4", "out_strides")} + ${layout_declare_ubo(B, "int", "numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// This constant is unused in this shader but is kept so that the signature is -// consistent with nchw_to_image. 
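[Related to the contiguous fast path used by linear_qcsnw above (and the is_contiguous() check added to QuantizedLinearQCSNW.cpp later in this diff), a small sketch of the condition behind vTensor::is_contiguous(): a buffer-backed tensor is contiguous when its NCHW dim order is the identity permutation.]

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Sketch of the dim-order test inside vTensor::is_contiguous().
bool is_contiguous_dim_order(const std::vector<int64_t>& dim_order) {
  for (size_t i = 0; i < dim_order.size(); ++i) {
    if (dim_order.at(i) != static_cast<int64_t>(i)) {
      return false;
    }
  }
  return true;
}

int main() {
  std::cout << is_contiguous_dim_order({0, 1, 2, 3}) << "\n"; // 1
  // e.g. a height/width-transposed view is not contiguous
  std::cout << is_contiguous_dim_order({0, 1, 3, 2}) << "\n"; // 0
}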
-${layout_declare_spec_const(C, "int", "UNUSED_layout", "0")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); ${layout_declare_spec_const(C, "int", "transpose_hw", "0")} void main() { @@ -37,7 +36,7 @@ void main() { return; } - ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides); + ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 sizes = out_sizes; if (transpose_hw == 1) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 3bcbf04a3ba..6509015b4b6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -9,6 +9,8 @@ #ifndef SELECT_GLSLH #define SELECT_GLSLH +#ifndef USING_BUFFER + /* * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -29,6 +31,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Given an output tensor index, return the corresponding input tensor index for * the select operator. This is done by "inserting" the select index at the diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 5d4cc70fdc1..87325754f4d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -9,6 +9,8 @@ #ifndef SLICE_GLSLH #define SLICE_GLSLH +#ifndef USING_BUFFER + /** * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -26,6 +28,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Converts output tensor indices to input tensor indices for the slice operation. 
* This function maps the output indices to the corresponding input indices based on diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl index 3ca854e0526..7e95b52d8f4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl @@ -37,8 +37,10 @@ layout(push_constant) uniform restrict Block { int selected_dim; }; -${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -50,7 +52,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); const int in_bufi = tidx_to_bufi(in_tidx, in_strides); diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl index 5df813d1241..fe6304c0fa0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -37,40 +37,28 @@ $if STORAGE == "buffer": ${layout_declare_ubo(B, "ivec4", "cond_strides")} ${layout_declare_ubo(B, "ivec4", "self_strides")} ${layout_declare_ubo(B, "ivec4", "other_strides")} - - ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "cond_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "self_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} $else: ${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { int out_bufi = int(gl_GlobalInvocationID.x); - // ivec4 tidx = ivec4(gl_GlobalInvocationID, 0); - // int out_bufi = tidx_to_bufi(tidx, out_strides); - // int cond_bufi = tidx_to_bufi(tidx, cond_strides); - // int self_bufi = tidx_to_bufi(tidx, self_strides); - // int other_bufi = tidx_to_bufi(tidx, other_strides); if (out_bufi >= out_numl) { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - out_bufi = tidx_to_bufi(out_tidx, out_strides); - - const ivec4 cond_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int cond_bufi = tidx_to_bufi(cond_tidx, cond_strides); - - const ivec4 self_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int self_bufi = tidx_to_bufi(self_tidx, self_strides); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - const ivec4 other_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int other_bufi = tidx_to_bufi(other_tidx, other_strides); + const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides); + const int self_bufi = tidx_to_bufi(out_tidx, self_strides); + const int other_bufi = tidx_to_bufi(out_tidx, other_strides); COND_T cond = t_condition[cond_bufi] ; T v_self = t_self[self_bufi]; diff --git 
a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index d260ed767d0..28279c196c0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -143,9 +143,9 @@ void add_binary_op_buffer_node( PushConstantDataInfo(&alpha_val, sizeof(float)), }}, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(in1), - graph.packed_dim_of(in2)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(in1), + graph.hashed_layout_of(in2)}, // Resize Args {}, // Resizing Logic diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp deleted file mode 100644 index 25a0ff9a7f5..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -void add_cat_default_node( - ComputeGraph& graph, - ValueRef in_list_ref, - ValueRef dim_ref, - ValueRef out) { - ValueListPtr input_list = graph.get_value_list(in_list_ref); - int64_t dim = graph.extract_scalar(dim_ref); - vTensorPtr t_out = graph.get_tensor(out); - - const auto packed_dim = t_out->packed_dim(); - const auto packed_dim_index = static_cast(kWidth4D - packed_dim); - - DimIndex dim_index = normalize_to_dim_index(*t_out, dim); - // Index of dimension to be concatenated in (w, h, c * b) coordinate system - const auto dim_xyz_index = std::min(2, -dim_index - 1); - - if (dim_index > kWidth4D || dim_index < kBatch4D) { - VK_THROW("Unexpected value of dim_index=", dim_index); - } - - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - - const bool is_concat_channel = (dim_index == kChannel4D); - - // if concatenating channels - if (is_concat_channel) { - // set destination offset w as channel size of the output tensor - dst_offset[3] = dim_at(t_out->sizes(), kChannel4D); - } - - for (ValueRef input_ref : *input_list) { - const vTensorPtr t_in = graph.get_tensor(input_ref); - const utils::ivec3 range = t_in->logical_limits(); - const auto in_channel_size = dim_at(t_in->sizes(), kChannel4D); - // if concatenating same dimension as the packed dimension - if (dim_index == packed_dim_index) { - // if concatenating channels, use add_copy_channel_offset_node function as - // add_copy_packed_dim_offset_node does not support channel packing - if (is_concat_channel) { - add_copy_channel_offset_node( - graph, - input_ref, - in_channel_size, - src_offset[2], - dst_offset[2], - out); - dst_offset[dim_xyz_index] += in_channel_size; - } else { - // src_offset[3] is not used now but will be used in the future when - // add_copy_packed_dim_offset_node will support channel packing - // - // set source offset w as channel size of the output tensor if - // concatenating channels - src_offset[3] = is_concat_channel ? in_channel_size : 0; - add_copy_packed_dim_offset_node( - graph, input_ref, range, src_offset, dst_offset, out); - dst_offset[dim_xyz_index] += dim_at(t_in->sizes(), packed_dim_index); - } - } else { - // set source offset w as channel size of the output tensor if - // concatenating channels - src_offset[3] = is_concat_channel ? 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp
new file mode 100644
index 00000000000..315dabdb1d5
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace vkcompute {
+
+std::vector get_concat_sizes(
+    ComputeGraph& graph,
+    const std::vector& in_value_refs,
+    const int64_t dim) {
+  // Get the sizes of the first input tensor as a starting point
+  std::vector new_out_sizes = graph.sizes_of(in_value_refs.at(0));
+
+  // Sum up the sizes along the concatenation dimension
+  for (size_t i = 1; i < in_value_refs.size(); ++i) {
+    const std::vector in_sizes = graph.sizes_of(in_value_refs.at(i));
+    new_out_sizes.at(dim) += in_sizes.at(dim);
+  }
+
+  return new_out_sizes;
+}
+
+void resize_concat_node(
+    ComputeGraph* graph,
+    const std::vector& args,
+    const std::vector& extra_args) {
+  // Extract relevant ValueRefs
+  const ValueRef out_ref = args.at(0).refs.at(0);
+  const std::vector& in_value_refs = args.at(1).refs;
+
+  int64_t dim = graph->extract_scalar(extra_args.at(0));
+
+  // Normalize dim if negative
+  const int64_t ndim = graph->dim_of(out_ref);
+  if (dim < 0) {
+    dim += ndim;
+  }
+
+  // Calculate the new sizes
+  std::vector new_out_sizes =
+      get_concat_sizes(*graph, in_value_refs, dim);
+
+  // Resize the output tensor
+  graph->virtual_resize(out_ref, new_out_sizes);
+}
+
+void add_concat_node(
+    ComputeGraph& graph,
+    const ValueRef tensors_ref,
+    const ValueRef dim_ref,
+    const ValueRef out) {
+  std::vector in_value_refs;
+
+  {
+    const ValueListPtr tensors = graph.get_value_list(tensors_ref);
+
+    VK_CHECK_COND(
+        tensors->size() <= 3,
+        "Currently only concatenation of <= 3 tensors is supported");
+
+    for (const ValueRef in : *tensors) {
+      in_value_refs.push_back(in);
+    }
+  }
+
+  const int64_t dim = graph.extract_scalar(dim_ref);
+
+  const int64_t ndim = graph.dim_of(in_value_refs.at(0));
+  int64_t normalized_dim = dim;
+  if (normalized_dim < 0) {
+    normalized_dim += ndim;
+  }
+
+  const int64_t dim_whcn = nchw_dim_to_whcn_dim(normalized_dim, ndim);
+  const ValueRef dim_whcn_ref = graph.get_or_add_value_for_int(dim_whcn);
+
+  vkapi::ParamsBindList param_buffers = {
+      graph.get_or_create_int_param_buffer(dim_whcn_ref, 0)};
+
+  std::vector push_constants;
+  vkapi::SpecVarList spec_vars;
+
+  if (graph.is_buffer_storage(out)) {
+    param_buffers.append(graph.sizes_ubo(out));
+    param_buffers.append(graph.strides_ubo(out));
+
+    for (const ValueRef in_ref : in_value_refs) {
+      param_buffers.append(graph.sizes_ubo(in_ref));
+      param_buffers.append(graph.strides_ubo(in_ref));
+    }
+
+    param_buffers.append(graph.numel_ubo(out));
+
+    spec_vars = {graph.hashed_layout_of(out)};
+  } else {
+    push_constants = {graph.sizes_pc_of(out)};
+
+    spec_vars = {graph.hashed_layout_of(out)};
+
+    for (const ValueRef in_ref : in_value_refs) {
+      push_constants.push_back(graph.sizes_pc_of(in_ref));
+      spec_vars.append(graph.hashed_layout_of(in_ref));
+    }
+  }
+
+  std::string kernel_name = "concat";
+  if (in_value_refs.size() == 1) {
+    kernel_name += "_1";
+  } else if (in_value_refs.size() == 2) {
+    kernel_name += "_2";
+  } else if (in_value_refs.size() == 3) {
+    kernel_name += "_3";
+  }
+  if (graph.is_buffer_storage(out)) {
+    kernel_name += "_buffer";
+  } else {
+    kernel_name += "_texture3d";
+  }
+
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {in_value_refs, vkapi::kRead}},
+      // Parameter buffers
+      param_buffers,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      spec_vars,
+      // Resize Args
+      {dim_ref},
+      // Resizing Logic
+      resize_concat_node));
+}
+
+void cat_tensor(ComputeGraph& graph, const std::vector& args) {
+  // Extract arguments
+  const ValueRef tensors_ref = args.at(0);
+  const ValueRef dim_ref = args.at(1);
+  const ValueRef out = args.at(2);
+
+  // Add concat node
+  add_concat_node(graph, tensors_ref, dim_ref, out);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.cat.default, cat_tensor);
+}
+
+} // namespace vkcompute
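
add_concat_node normalizes the user-facing concat dim and converts it from NCHW convention to WHCN convention (where dim 0 is the innermost width dimension) before handing it to the kernel through an int param buffer. A short C++ sketch of that mapping, for illustration only; the real conversion is done by the runtime's nchw_dim_to_whcn_dim helper, and the _sketch name here is hypothetical.

    #include <cstdint>

    // The graph-level concat dim is given in NCHW convention (dim 0 is the
    // outermost dimension), while shaders index tensors in WHCN convention
    // (dim 0 is the innermost/width dimension), so the dim index is flipped.
    int64_t nchw_dim_to_whcn_dim_sketch(const int64_t nchw_dim, const int64_t ndim) {
      return ndim - 1 - nchw_dim;
    }

    // Example: for a 4-D tensor, torch.cat(..., dim=1) concatenates channels;
    // in WHCN terms that is dimension 4 - 1 - 1 = 2.
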
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp
index 6e101195e3f..07502a7a107 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp
@@ -43,6 +43,10 @@ void check_linear_qcsnw_args(
     VK_CHECK_COND(
         utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes));
   }
+
+  if (graph.is_buffer_storage(out)) {
+    VK_CHECK_COND(graph.is_contiguous(out));
+  }
 }
 
 void resize_linear_qcsnw_node(
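
check_linear_qcsnw_args now requires buffer-backed outputs to be contiguous. For reference, a hedged C++ sketch of the invariant such a check enforces, namely that the strides match the row-major strides implied by the sizes; this is an illustration only, not necessarily how ComputeGraph::is_contiguous is implemented.

    #include <cstdint>
    #include <vector>

    // A tensor is contiguous when its strides are exactly the row-major
    // strides derived from its sizes: the last dimension has stride 1 and
    // each earlier dimension's stride is the product of all later sizes.
    bool is_contiguous_sketch(
        const std::vector<int64_t>& sizes,
        const std::vector<int64_t>& strides) {
      int64_t expected_stride = 1;
      for (int64_t i = static_cast<int64_t>(sizes.size()) - 1; i >= 0; --i) {
        if (strides[i] != expected_stride) {
          return false;
        }
        expected_stride *= sizes[i];
      }
      return true;
    }
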
diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp
index 423c9789d67..7b5fad57483 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp
@@ -55,7 +55,6 @@ void add_transfer_copy_node(
   } transfer_params{static_cast(dim_whcn)};
 
   std::vector push_constants;
-  vkapi::SpecVarList spec_vars;
 
   if (graph.is_buffer_storage(out)) {
     push_constants = {
@@ -64,23 +63,18 @@
         graph.strides_pc_of(in),
         graph.numel_pc_of(out),
         PushConstantDataInfo(&transfer_params, sizeof(transfer_params))};
-
-    spec_vars = {
-        graph.packed_dim_of(out),
-        graph.packed_dim_of(in),
-    };
   } else {
     push_constants = {
         graph.sizes_pc_of(out),
        graph.sizes_pc_of(in),
         PushConstantDataInfo(&transfer_params, sizeof(transfer_params))};
-
-    spec_vars = {
-        graph.hashed_layout_of(out),
-        graph.hashed_layout_of(in),
-    };
   }
 
+  vkapi::SpecVarList spec_vars = {
+      graph.hashed_layout_of(out),
+      graph.hashed_layout_of(in),
+  };
+
   // Determine the shader directly
   std::string kernel_name;
   if (transfer_type == TransferType::SELECT) {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp
index a3be34830d3..ea610b1fe74 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Where.cpp
@@ -54,7 +54,7 @@ void add_where_texture_node(
       // Push Constants
       {},
       // Specialization Constants
-      {graph.packed_dim_of(out)},
+      {graph.hashed_layout_of(out)},
       // Resize Arguments
       {},
       // Resizing Logic
@@ -96,10 +96,7 @@ void add_where_buffer_node(
       // Push Constants
       {},
       // Specialization Constants
-      {graph.packed_dim_of(out),
-       graph.packed_dim_of(cond),
-       graph.packed_dim_of(self),
-       graph.packed_dim_of(other)},
+      {graph.hashed_layout_of(out)},
       // Resize Arguments
       {},
       // Resizing Logic
diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp
index 938666802ef..9e8394ffa9c 100644
--- a/backends/vulkan/runtime/vk_api/Descriptor.cpp
+++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp
@@ -32,8 +32,8 @@ BufferBindInfo::BufferBindInfo(
 
 BufferBindInfo::BufferBindInfo(
     const VulkanBuffer& buffer_p,
-    const uint32_t offset_p,
-    const uint32_t range_p)
+    const size_t offset_p,
+    const size_t range_p)
     : handle(buffer_p.handle()),
       offset(buffer_p.mem_offset() + offset_p),
       range(range_p) {
diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h
index 60d66a22619..15ea5e23e33 100644
--- a/backends/vulkan/runtime/vk_api/Descriptor.h
+++ b/backends/vulkan/runtime/vk_api/Descriptor.h
@@ -36,8 +36,8 @@ struct BufferBindInfo final {
   BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u);
   BufferBindInfo(
       const VulkanBuffer& buffer_p,
-      const uint32_t offset_p,
-      const uint32_t range_p);
+      const size_t offset_p,
+      const size_t range_p);
 };
 
 struct ParamsBindList final {
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index bd67933dc93..813807445f0 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -52,13 +52,17 @@ def get_binary_elementwise_inputs():
             ((S, S1, S2), (S, S1, 1), 2.0),
             ((S, S1, S2), (S, 1, S2), 2.0),
             ((XS, S, S1, S2), (XS, S, 1, 1), 2.0),
+            ((3, 64, 1), (1, 64, 1)),
         ]
     )
     test_suite.layouts = [
         "utils::kWidthPacked",
         "utils::kChannelsPacked",
     ]
-    test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"]
+    test_suite.storage_types = [
+        "utils::kBuffer",
+        "utils::kTexture3D",
+    ]
     return test_suite
 
@@ -1192,9 +1196,12 @@ def get_cat_inputs():
     )
     test_suite.layouts = [
         "utils::kWidthPacked",
-        "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
+    test_suite.storage_types = [
+        "utils::kTexture3D",
+        "utils::kBuffer",
+    ]
     test_suite.data_gen = "make_seq_tensor"
     test_suite.dtypes = ["at::kFloat"]
     return test_suite
diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py
index ce6ab32ce60..4f0d2ff11ef 100644
--- a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py
+++ b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py
@@ -29,6 +29,7 @@ class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple
   void SetUp() override {{
     GraphConfig config;
+    config.expect_dynamic_shapes = true;
     utils::StorageType default_storage_type;
     utils::GPUMemoryLayout default_memory_layout;
     std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam();
@@ -119,7 +120,7 @@ def gen_parameterization(self) -> str:
       return vkapi::kInt;
     case c10::kChar:
       return vkapi::kChar;
-    case c10::kBool: 
+    case c10::kBool:
       return vkapi::kBool;
     default:
       VK_THROW("Unsupported at::ScalarType!");
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index dfd22198363..0096834f3c6 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -733,6 +733,10 @@ def forward(self, x):
 
         self.lower_module_and_test_output(model, sample_inputs)
 
+    @unittest.skip(
+        "Currently this test is failing due to weird partitioning because the eq scalar "
+        "operator is not supported yet. Re-enable when the operator is supported."
+    )
     def test_vulkan_backend_partial_dynamic_shapes(self):
         class SimpleModel(torch.nn.Module):
             def __init__(self):
@@ -1286,14 +1290,13 @@ class TestModule(torch.nn.Module):
             def __init__(self):
                 super().__init__()
 
-            def forward(self, x, y, z, w):
-                return torch.cat([x, y, z, w], dim=1)
+            def forward(self, x, y, z):
+                return torch.cat([x, y, z], dim=1)
 
         sample_inputs = (
             torch.randn(size=(3, 6, 2, 7), dtype=torch.float32),
             torch.randn(size=(3, 1, 2, 7), dtype=torch.float32),
             torch.randn(size=(3, 9, 2, 7), dtype=torch.float32),
-            torch.randn(size=(3, 3, 2, 7), dtype=torch.float32),
         )
 
         self.lower_module_and_test_output(
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index 3f5dba9e277..faa0e7d0c47 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -26,13 +26,14 @@ void record_nchw_to_buffer_op(
     vkapi::VulkanBuffer& src_buffer,
     api::vTensor& v_dst) {
   vkapi::PipelineBarrier pipeline_barrier{};
+  vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()};
   context->submit_compute_job(
       get_nchw_to_tensor_shader(v_dst, true, false),
       pipeline_barrier,
       {uint32_t(v_dst.numel()), 1, 1},
       {64, 1, 1},
-      {},
+      specialization_constants,
       VK_NULL_HANDLE,
       0,
       v_dst.buffer(
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index c4ccc860bc2..17f197dfdeb 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -259,14 +259,10 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) {
           /*allocate_memory = */ false);
 
       ASSERT_TRUE(new_v_tensor.strides() == ref_strides);
-      ASSERT_TRUE(
-          new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides);
 
       // Resize vtensor and check that updated metadata is correct
       v_tensor_to_resize.virtual_reconfigure(sizes, dim_order);
       ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides);
-      ASSERT_TRUE(
-          v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides);
     }
   }
 }
@@ -1003,18 +999,14 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) {
     b.virtual_resize(new_sizes);
     c.virtual_resize(new_sizes);
 
-    fill_staging(
-        staging_buffer_a, float(new_sizes[1] + 1.5f), a.staging_buffer_numel());
-    fill_staging(
-        staging_buffer_b,
-        float(new_sizes[2] + 55.0f),
-        b.staging_buffer_numel());
+    fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.numel());
+    fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.numel());
 
     submit_to_gpu();
     check_staging_buffer(
         staging_buffer_c,
         float(new_sizes[1] + new_sizes[2] + 56.5f),
-        c.staging_buffer_numel());
+        c.numel());
   }
 }
@@ -1096,7 +1088,6 @@ TEST_F(VulkanComputeAPITest, test_tensor_creation_from_vulkan_image) {
 
   const auto exp_numel = w * h * d * 4;
   EXPECT_TRUE(tensor.numel() == exp_numel);
-  EXPECT_TRUE(tensor.padded_numel() == exp_numel);
 }
 
 TEST(VulkanComputeGraphTest, test_values_scalars) {