410 changes: 284 additions & 126 deletions backends/vulkan/runtime/api/containers/Tensor.cpp

Large diffs are not rendered by default.

162 changes: 92 additions & 70 deletions backends/vulkan/runtime/api/containers/Tensor.h
@@ -81,6 +81,18 @@ struct LastAccess {
: stage{stage_flags}, access{access_flags} {}
};

/*
* Calculate the number of elements that a GPU buffer would require to store the
* contents of a tensor. This will depend on the storage type and dtype of the
* tensor, as well as the features available on the device.
*/
int64_t calculate_gpu_buffer_numel(
Context* const context,
const std::vector<int64_t>& sizes,
const utils::uvec3 image_extents,
const utils::StorageType storage_type,
const vkapi::ScalarType dtype);
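
A minimal sketch of the shape this calculation could take, assuming texture-backed tensors need enough elements to cover every texel (4 elements per texel) while buffer-backed tensors use the plain element count; both are assumptions for illustration, since Tensor.cpp's diff is not rendered:

int64_t calculate_gpu_buffer_numel_sketch(
    const std::vector<int64_t>& sizes,
    const utils::uvec3 image_extents,
    const utils::StorageType storage_type) {
  if (storage_type != utils::kBuffer) {
    // Texture storage: cover every texel; assume 4 elements per texel.
    return static_cast<int64_t>(image_extents[0]) *
        static_cast<int64_t>(image_extents[1]) *
        static_cast<int64_t>(image_extents[2]) * 4;
  }
  // Buffer storage: one element per logical tensor element. Device features
  // (e.g. 8-bit storage support) and dtype may further adjust this.
  int64_t numel = 1;
  for (const int64_t s : sizes) {
    numel *= s;
  }
  return numel;
}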

class vTensorStorage final {
public:
// Do not allow empty vTensorStorage construction
@@ -91,7 +103,7 @@ class vTensorStorage final {
const utils::StorageType storage_type,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim,
const std::vector<int64_t>& padded_sizes,
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const bool allocate_memory = true);

@@ -140,6 +152,10 @@ class vTensorStorage final {
void verify() const;

public:
inline size_t buffer_len() const {
return utils::safe_downcast<size_t>(buffer_length_);
}

inline VkFormat texture_format() {
return image_.format();
}
@@ -207,15 +223,19 @@ class vTensor final {
vTensor(vTensor&& other) = default;
vTensor& operator=(vTensor&& other) = default;

~vTensor() = default;

enum class Attribute : uint8_t {
SIZES,
WHCN_DIM_ORDER,
STRIDES,
LOGICAL_LIMITS,
NUMEL,
};

class UniformData {
utils::ivec4 sizes_v;
utils::ivec4 whcn_dim_order_v;
utils::ivec4 strides_v;
// See the comments documenting logical_limits() for more context.
TextureLimits logical_limits;
@@ -227,10 +247,12 @@

UniformData(
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& whcn_dim_order,
const std::vector<int64_t>& strides,
const TextureLimits& logical_limits,
const size_t numel_ll)
: sizes_v(utils::make_whcn_ivec4(sizes)),
whcn_dim_order_v(utils::make_ivec4(whcn_dim_order)),
strides_v(utils::make_whcn_ivec4(strides)),
logical_limits(logical_limits),
numel(utils::safe_downcast<int32_t>(numel_ll)) {}
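
For context on the two helpers used in the initializer list above: a plausible sketch, assuming make_whcn_ivec4 reverses NCHW-ordered metadata into WHCN order while make_ivec4 keeps the given order, both padding to 4 entries (the helper below is hypothetical, and the padding value of 1 is an assumption):

utils::ivec4 make_whcn_ivec4_sketch(const std::vector<int64_t>& nchw) {
  utils::ivec4 out{1, 1, 1, 1};
  const size_t ndim = nchw.size();
  for (size_t i = 0; i < ndim && i < 4; ++i) {
    // Reverse the order: the last NCHW dim (width) becomes entry 0.
    out[i] = utils::safe_downcast<int32_t>(nchw[ndim - 1 - i]);
  }
  return out;
}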
@@ -293,21 +315,17 @@ class vTensor final {
// strides of the tensor in NCHW dimension order
std::vector<int64_t> strides_;

/*
* The metadata members below are derived from the members above, and are
* typically used to pass tensor metadata to compute shaders.
*/
// number of elements based on the canonical sizes
size_t numel_;

// For texture backed tensors, this int32 contains the axis map data packed
// into a single int32. For buffer backed tensors, this int32 contains the
// WHCN dim order data packed into a single int32.
int32_t hashed_layout_;

// padded sizes of the tensor in NCHW dimension order. See the
// calculate_padded_sizes() function for more context. Note that padded sizes
// are only used for texture storage, and not for buffer storage.
std::vector<int64_t> padded_sizes_;
// Contains the strides of the tensor, with the dimensionality padded to the
// nearest multiple of 4. Unsqueezed dims will have a stride of int32_t max.
std::vector<int64_t> unsqueezed_strides_;
// Contains the number of elements in the tensor according to the padded
// sizes.
size_t padded_numel_;
// Pre-compute these quantities to avoid frequent re-computation
size_t nbytes_per_ubo_;
size_t max_ubo_nbytes_;

/*
* Utility GPU buffer that can be passed to shaders in order to convey tensor
@@ -320,15 +338,13 @@
* context about the data contained in each buffer.
*/
ParamsBuffer uniforms_;
uint32_t uniforms_size_;
uint32_t sizes_uniform_offset_;
uint32_t unsqueezed_strides_offset_;
uint32_t numel_uniform_offset_;
uint32_t logical_limits_uniform_offset_;

// Maximum number of metadata fields that can be stored in the metadata UBO.
// This is used to calculate the size of the UBO that should be allocated.
constexpr static size_t kMaxMetadataFieldCount = 4;
uint32_t uniforms_size_ = 0u;
uint32_t sizes_uniform_offset_ = kUniformOffsetUnset;
uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset;
uint32_t strides_uniform_offset = kUniformOffsetUnset;
uint32_t numel_uniform_offset_ = kUniformOffsetUnset;
uint32_t logical_limits_uniform_offset_ = kUniformOffsetUnset;

// Initial value of uniform buffer offsets. 1 is selected as it is essentially
// impossible for a UBO to have an offset of 1.
@@ -381,9 +397,6 @@ class vTensor final {
return storage_->storage_type_ == utils::kBuffer;
}

private:
void set_logical_limits(const utils::uvec3& image_extents);

public:
/*
* The logical limits of the tensor are derived from the image extents of the
@@ -451,21 +464,37 @@ class vTensor final {
return dim_order_;
}

inline const std::vector<int64_t>& strides() const {
return strides_;
}

inline size_t numel() const {
return numel_;
}

inline size_t nbytes() const {
return element_size(dtype()) * numel();
}

inline const std::vector<int64_t>& axis_map() const {
return axis_map_;
}

/*
* Returns a single int32_t that contains the values of the axis map and the
* packed dimension packed into a single int32_t, such that it can be used as
* a specialization constant in a compute shader. This allows the SPIR-V to
* bytecode compilation to perform compile-time unfolding on the axis map.
* Each element of the axis map and the value of the packed dimension take up
* 4 bits in the packed int32_t.
* For texture backed tensors, this function returns an int32_t that contains
* the axis map + packed dimension. Each element of the axis map occupies 4
* bits of the int32.
*
* For buffer backed tensors, the int32_t contains the WHCN dim order, where
* each element of the dim order array occupies 4 bits of the int32.
*
* This int32 is typically consumed as a specialization constant in compute
* shaders, where it is subsequently unpacked. The layout data of a vTensor
* instance is typically static once created, which is why passing it via a
* specialization constant is appropriate.
*/
inline int32_t hashed_layout() const {
return axis_map_.at(0) + (axis_map_.at(1) << 4) + (axis_map_.at(2) << 8) +
(axis_map_.at(3) << 12) + (packed_dim_ << 16);
return hashed_layout_;
}
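
The expression being replaced above documents the exact bit layout, so the unpacking can be sketched directly from it (hypothetical helper, not part of this diff):

// Recover the axis map and packed dim of a texture backed tensor from its
// hashed layout: 4 bits per axis map entry, packed dim at bits 16-19.
inline void unhash_texture_layout_sketch(
    const int32_t hashed,
    int32_t axis_map[4],
    int32_t& packed_dim) {
  for (int i = 0; i < 4; ++i) {
    axis_map[i] = (hashed >> (4 * i)) & 0xF;
  }
  packed_dim = (hashed >> 16) & 0xF;
}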

/*
@@ -478,57 +507,48 @@
return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2;
}

inline const std::vector<int64_t>& strides() const {
return strides_;
}
/*
* Returns true if a buffer backed tensor's dim order matches that of a
* contiguous tensor, i.e. the dim order is {0, 1, 2, ...}.
* Returns false for texture backed tensors.
*/
bool is_contiguous() const;
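
Since the definition lives in Tensor.cpp (not rendered in this diff), a minimal sketch of the check described by the comment above:

bool is_contiguous_sketch(
    const utils::StorageType storage_type,
    const std::vector<int64_t>& dim_order) {
  if (storage_type != utils::kBuffer) {
    return false;
  }
  for (size_t i = 0; i < dim_order.size(); ++i) {
    if (dim_order.at(i) != static_cast<int64_t>(i)) {
      return false;
    }
  }
  return true;
}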

inline const std::vector<int64_t>& unsqueezed_strides() const {
return unsqueezed_strides_;
private:
inline size_t nbytes_per_ubo() const {
return storage_->context_->adapter_ptr()->min_ubo_alignment();
}

size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const;
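
A plausible reading of these two members: each metadata field occupies one min-alignment slot in the UBO, so the maximum UBO size is the per-field size times the field count. The count of 5 below is an assumption (sizes, dim order, strides, numel, logical limits):

size_t get_max_ubo_nbytes_sketch(const size_t nbytes_per_ubo) {
  constexpr size_t kNumTensorMetadataFields = 5; // assumed field count
  return kNumTensorMetadataFields * nbytes_per_ubo;
}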

public:
/*
* Returns a GPU buffer containing the sizes of the tensor in WHCN order.
* Note that dimensions that are not present in the tensor's sizes are set to
* a size of 1.
* The functions below return the buffer binding info for a UBO containing
* some of the tensor's metadata, which can be used to pass that metadata to
* a compute shader. The other method of passing in tensor metadata is via
* push constants. The trade-off between the two is that push constants may
* be slightly more performant and memory efficient; however, to update the
* values in a push constant (e.g. after a tensor resize between inferences),
* the command buffer must be re-encoded. UBOs, on the other hand, can update
* their data by writing to their mapped memory without requiring a command
* buffer re-encode.
*/

const vkapi::BufferBindInfo sizes_ubo();

/*
* Returns a GPU buffer containing the strides of the tensor in WHCN order.
* Note that the strides are extended to a dimensionality that is a multiple
* of 4, thus dimensions that are not present in the tensor's sizes are set to
* have a stride equal to the stride of the "slowest moving" dimension.
*/
const vkapi::BufferBindInfo dim_order_ubo();

const vkapi::BufferBindInfo strides_ubo();

/*
* Returns a GPU buffer containing the logical limits of the tensor. See the
* comments for logical_limits() for more context.
*/
const vkapi::BufferBindInfo logical_limits_ubo();

/*
* Returns the number of elements in the buffer used to store the tensor.
*/
const vkapi::BufferBindInfo numel_ubo();

inline size_t numel() const {
return uniform_data_->numel;
}

inline size_t nbytes() const {
return element_size(dtype()) * numel();
}

/*
* Returns numel but based on padded_sizes_ instead of sizes_
*/
inline size_t padded_numel() const {
return padded_numel_;
public:
inline size_t staging_buffer_numel() const {
return storage_->buffer_len();
}

size_t staging_buffer_numel() const;

inline size_t staging_buffer_nbytes() const {
return element_size(dtype()) * staging_buffer_numel();
}
@@ -608,6 +628,8 @@ class vTensor final {
};

static constexpr vTensor::Attribute kTensorSizes = vTensor::Attribute::SIZES;
static constexpr vTensor::Attribute kTensorDimOrder =
vTensor::Attribute::WHCN_DIM_ORDER;
static constexpr vTensor::Attribute kTensorStrides =
vTensor::Attribute::STRIDES;
static constexpr vTensor::Attribute kTensorLogicalLimits =
14 changes: 14 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.h
@@ -346,6 +346,10 @@ class ComputeGraph final {
return values_.at(idx).toTensor().strides_ubo();
}

inline vkapi::BufferBindInfo dim_order_ubo(const ValueRef idx) {
return values_.at(idx).toTensor().dim_order_ubo();
}

inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) {
return values_.at(idx).toTensor().numel_ubo();
}
@@ -354,6 +358,10 @@
return values_.at(idx).toTensor().has_standard_axis_map();
}

inline bool is_contiguous(const ValueRef idx) const {
return values_.at(idx).toTensor().is_contiguous();
}

inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) {
return values_.at(idx).toTensor().logical_limits_ubo();
}
@@ -363,6 +371,12 @@
values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorSizes);
}

inline PushConstantDataInfo dim_order_pc_of(const ValueRef idx) const {
return PushConstantDataInfo(
values_.at(idx).toConstTensor().get_uniform_data(),
api::kTensorDimOrder);
}

inline PushConstantDataInfo strides_pc_of(const ValueRef idx) const {
return PushConstantDataInfo(
values_.at(idx).toConstTensor().get_uniform_data(),
13 changes: 6 additions & 7 deletions backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl
@@ -48,19 +48,18 @@ $else:

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")}

$if STORAGE == "buffer":
${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")}
${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")}
${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")}
const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
$else:
${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
const lowp int packed_dim = unhash_packed_dim(out_layout);

${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 other_axis_map = unhash_axis_map(other_layout);

#ifdef USING_BUFFER
@@ -77,7 +76,7 @@ void main() {
return;
}

const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim);
const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);
const ivec4 in_tidx = min(out_tidx, in_sizes - 1);
const ivec4 other_tidx = min(out_tidx, other_sizes - 1);
