namespace vkcompute {
namespace api {

+/*
+ * Used to infer the sizes of a tensor that would correspond to a given
+ * VulkanImage.
+ */
std::vector<int64_t> calculate_sizes(
    const vkapi::VulkanImage& image,
    const utils::GPUMemoryLayout memory_layout) {
@@ -143,58 +147,19 @@ bool dim_order_is_valid(const std::vector<int64_t>& dim_order) {
  return sum == n * (n + 1) / 2;
}

-/*
- * Applies the following transformations to a tensor's dim_order vector:
- * 1. Reverse the order of elements so that the fastest moving dimensions are
- *    first.
- * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the
- *    width dimension, 1 represents the height dimension, and 2 represents the
- *    channels dimension.
- * 3. Unsqueeze the dim_order vector to the next multiple of 4.
-
- * These transformations make it easier to use the dim order in a compute shader
- */
-std::vector<int64_t> create_whcn_dim_order(
-    const std::vector<int64_t>& dim_order) {
-  size_t ndim = dim_order.size();
-  std::vector<int64_t> whcn_order(ndim);
-
-  // Convert from NCHW to WHCN index, and flip the dim order so that the fastest
-  // moving dimension is first.
-  // example: {1, 2, 0} -> {2, 0, 1}
-  //          {height, width, channels} -> {channels, width, height}
-  for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim;
-       ++whcn_i, --nchw_i) {
-    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
-  }
-
-  // Unsqueeze to the next multiple of 4
-  size_t ndim_up4 = utils::align_up_4(ndim);
-  whcn_order.resize(ndim_up4);
-
-  // Append unsqueezed dimensions
-  for (size_t i = ndim; i < ndim_up4; ++i) {
-    whcn_order.at(i) = i;
-  }
-
-  return whcn_order;
-}
-
-std::vector<int64_t> unsqueeze_strides(
-    const std::vector<int64_t>& strides,
-    const int64_t numel) {
-  const size_t ndim = strides.size();
-  const size_t ndim_up4 = utils::align_up_4(strides.size());
-  std::vector<int64_t> unsqueezed_strides(ndim_up4);
-  for (int32_t i = 1; i <= ndim; ++i) {
-    int64_t dim_stride = strides.at(ndim - i);
-    unsqueezed_strides.at(ndim_up4 - i) = dim_stride;
-  }
-
-  for (int32_t i = ndim + 1; i <= ndim_up4; ++i) {
-    unsqueezed_strides.at(ndim_up4 - i) = numel;
-  }
-  return unsqueezed_strides;
+utils::ivec4 flip_and_unsqueeze_ivec4(
+    const std::vector<int64_t>& tensor_metadata,
+    const vTensor::Attribute metadata_type,
+    const size_t numel) {
+  VK_CHECK_COND(tensor_metadata.size() <= 4);
+  std::vector<int32_t> flipped_metadata =
+      flip_and_unsqueeze<int32_t>(tensor_metadata, metadata_type, numel);
+  return {
+      flipped_metadata.at(0),
+      flipped_metadata.at(1),
+      flipped_metadata.at(2),
+      flipped_metadata.at(3),
+  };
}

std::vector<int64_t> calculate_padded_sizes(
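Note: the two helpers deleted above spell out the transformation that the new flip_and_unsqueeze path is expected to cover: flip the dim order into WHCN terms and pad it to 4 entries, and pad the strides vector to 4 entries using numel. A minimal standalone sketch that mirrors the removed code (names are local to the example, not part of the real API) and reproduces the example from the deleted comment:

```cpp
// Illustration only; the real code now routes through flip_and_unsqueeze.
#include <cstdint>
#include <iostream>
#include <vector>

// Reverse a NCHW dim_order, convert each index to WHCN terms, then pad to the
// next multiple of 4 with increasing indices (mirrors create_whcn_dim_order).
std::vector<int64_t> whcn_dim_order_demo(const std::vector<int64_t>& dim_order) {
  const size_t ndim = dim_order.size();
  std::vector<int64_t> whcn(ndim);
  for (size_t whcn_i = 0, nchw_i = ndim - 1; whcn_i < ndim; ++whcn_i, --nchw_i) {
    whcn[whcn_i] = static_cast<int64_t>(ndim) - 1 - dim_order[nchw_i];
  }
  const size_t ndim_up4 = (ndim + 3) & ~size_t(3);
  for (size_t i = ndim; i < ndim_up4; ++i) {
    whcn.push_back(static_cast<int64_t>(i));
  }
  return whcn;
}

// Copy strides into the tail of a 4-aligned vector and fill the leading
// entries with numel (mirrors unsqueeze_strides).
std::vector<int64_t> unsqueezed_strides_demo(
    const std::vector<int64_t>& strides,
    const int64_t numel) {
  const size_t ndim = strides.size();
  const size_t ndim_up4 = (ndim + 3) & ~size_t(3);
  std::vector<int64_t> out(ndim_up4, numel);
  for (size_t i = 1; i <= ndim; ++i) {
    out[ndim_up4 - i] = strides[ndim - i];
  }
  return out;
}

int main() {
  // Example from the deleted comment: {1, 2, 0} -> {2, 0, 1}, padded to
  // {2, 0, 1, 3}.
  for (int64_t d : whcn_dim_order_demo({1, 2, 0})) std::cout << d << ' ';
  std::cout << '\n';
  // Contiguous {2, 3, 4} tensor: strides {12, 4, 1}, numel 24 -> {24, 12, 4, 1}.
  for (int64_t s : unsqueezed_strides_demo({12, 4, 1}, 24)) std::cout << s << ' ';
  std::cout << '\n';
  return 0;
}
```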
@@ -309,7 +274,8 @@ int64_t calculate_gpu_buffer_numel(
  return numel;
}

-int32_t pack_into_int32(const std::vector<int64_t>& vec, const int32_t extra) {
+template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+int32_t pack_into_int32(const std::vector<T>& vec, const int32_t extra) {
  int32_t packed = static_cast<int32_t>(
      vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) +
      (extra << 16));
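For reference, the packing scheme above gives each of the four vector entries its own nibble, with `extra` landing at bit 16. A small self-contained check of that layout; the demo function is a local copy of the arithmetic and the input values are illustrative:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Local copy of the packing arithmetic, for illustration only.
int32_t pack_into_int32_demo(const std::vector<int32_t>& vec, int32_t extra) {
  return static_cast<int32_t>(
      vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) +
      (extra << 16));
}

int main() {
  // A contiguous WHCN dim order {0, 1, 2, 3} with extra = 0 packs to 0x3210:
  // 0 + (1 << 4) + (2 << 8) + (3 << 12) = 16 + 512 + 12288 = 12816.
  assert(pack_into_int32_demo({0, 1, 2, 3}, 0) == 0x3210);
  // extra = 5 adds 5 << 16 = 327680 on top, i.e. 0x53210.
  assert(pack_into_int32_demo({0, 1, 2, 3}, 5) == 0x53210);
  return 0;
}
```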
@@ -322,22 +288,24 @@ int32_t create_hashed_layout(
    const int32_t packed_dim,
    const utils::StorageType storage_type) {
  if (storage_type == utils::kBuffer) {
-    return pack_into_int32(create_whcn_dim_order(dim_order), 0);
+    return pack_into_int32(
+        flip_and_unsqueeze<int64_t>(dim_order, kTensorDimOrder, 0), 0);
  }
  return pack_into_int32(axis_map, packed_dim);
}

size_t calculate_max_ubo_nbytes(
-    const size_t nbytes_per_ubo,
+    const size_t min_nbytes_per_ubo,
    const utils::StorageType storage_type) {
-  // For texture backed tensors, the metadata fields needed are:
-  // sizes, logical limits
-  size_t max_metadata_field_count = 2u;
+  size_t ivec4_ubo_nbytes = utils::align_up(size_t(16), min_nbytes_per_ubo);
+  size_t uvec3_ubo_nbytes = utils::align_up(size_t(12), min_nbytes_per_ubo);
+  size_t int32_ubo_nbytes = utils::align_up(size_t(4), min_nbytes_per_ubo);
  if (storage_type == utils::kBuffer) {
    // sizes, strides, dim order, numel
-    max_metadata_field_count = 4u;
+    return 3 * ivec4_ubo_nbytes + int32_ubo_nbytes;
  }
-  return max_metadata_field_count * nbytes_per_ubo;
+  // sizes, logical limits
+  return ivec4_ubo_nbytes + uvec3_ubo_nbytes;
}

//
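The reworked calculate_max_ubo_nbytes sizes each metadata field separately and rounds it up to the device's minimum UBO offset alignment. A back-of-the-envelope sketch of the resulting budgets, assuming align_up(n, a) rounds n up to a multiple of a and a typical 256-byte minUniformBufferOffsetAlignment (both are assumptions for illustration, not values from this diff):

```cpp
#include <cstddef>
#include <iostream>

// Assumed semantics: round n up to the next multiple of the alignment a.
size_t align_up_demo(size_t n, size_t a) {
  return ((n + a - 1) / a) * a;
}

int main() {
  const size_t min_nbytes_per_ubo = 256; // assumed device alignment
  const size_t ivec4 = align_up_demo(16, min_nbytes_per_ubo); // 256
  const size_t uvec3 = align_up_demo(12, min_nbytes_per_ubo); // 256
  const size_t i32 = align_up_demo(4, min_nbytes_per_ubo);    // 256

  // Buffer storage binds sizes, strides, dim order (one ivec4 each) + numel.
  std::cout << "buffer max UBO nbytes:  " << 3 * ivec4 + i32 << '\n'; // 1024
  // Texture storage binds sizes (ivec4) + logical limits (uvec3).
  std::cout << "texture max UBO nbytes: " << ivec4 + uvec3 << '\n';   // 512
  return 0;
}
```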
@@ -595,8 +563,9 @@ vTensor::vTensor(
          packed_dim_,
          storage_type)),
      // Related to tensor metadata UBOs
-      nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
-      max_ubo_nbytes_{calculate_max_ubo_nbytes(nbytes_per_ubo_, storage_type)},
+      min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
+      max_ubo_nbytes_{
+          calculate_max_ubo_nbytes(min_nbytes_per_ubo_, storage_type)},
      uniforms_(),
      // Construct Tensor storage
      storage_(std::make_shared<vTensorStorage>(
@@ -607,23 +576,13 @@ vTensor::vTensor(
          sizes,
          dtype_,
          allocate_memory)) {
-  // Derived metadata
-  std::vector<int64_t> whcn_dim_order(4, 0);
-  std::vector<int64_t> unsqueezed_strides(4, 0);
-  // Only calculate derived metadata if needed for the desired storage type.
-  // Note that logical limits may be used by buffer storage as well in order to
-  // set global work group sizes for some compute shaders.
-  if (storage_type == utils::kBuffer) {
-    whcn_dim_order = create_whcn_dim_order(dim_order_);
-    unsqueezed_strides = unsqueeze_strides(strides_, numel_);
-  }
-
  uniform_data_ = std::make_shared<UniformData>(UniformData{
+      numel_,
      sizes_,
-      whcn_dim_order,
-      unsqueezed_strides,
-      calculate_logical_limits(storage_->image_extents_, axis_map_),
-      numel_});
+      dim_order_,
+      strides_,
+      calculate_logical_limits(storage_->image_extents_, axis_map_)});
+
  VK_CHECK_COND(
      dim_order_is_valid(dim_order_), "computed dim order is invalid");
}
@@ -648,18 +607,18 @@ vTensor::vTensor(
          packed_dim_,
          utils::kTexture3D)),
      // Related to tensor metadata UBOs
-      nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
+      min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
      max_ubo_nbytes_{
-          calculate_max_ubo_nbytes(nbytes_per_ubo_, utils::kTexture3D)},
+          calculate_max_ubo_nbytes(min_nbytes_per_ubo_, utils::kTexture3D)},
      uniforms_(),
      // Construct Tensor storage
      storage_(std::make_shared<vTensorStorage>(context, image)) {
  uniform_data_ = std::make_shared<UniformData>(UniformData{
+      numel_,
      sizes_,
      {0, 0, 0, 0},
      {0, 0, 0, 0},
-      calculate_logical_limits(storage_->image_extents_, axis_map_),
-      numel_});
+      calculate_logical_limits(storage_->image_extents_, axis_map_)});
}

vTensor::vTensor(vTensor& other)
@@ -672,7 +631,7 @@ vTensor::vTensor(vTensor& other)
      strides_(other.strides_.begin(), other.strides_.end()),
      numel_(other.numel_),
      hashed_layout_(other.hashed_layout_),
-      nbytes_per_ubo_{other.nbytes_per_ubo_},
+      min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
      max_ubo_nbytes_{other.max_ubo_nbytes_},
      uniforms_(),
      // Copy Tensor storage
@@ -697,22 +656,35 @@ vTensor::vTensor(
          axis_map_,
          packed_dim_,
          other.storage_type())),
-      nbytes_per_ubo_{other.nbytes_per_ubo_},
+      min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
      max_ubo_nbytes_{other.max_ubo_nbytes_},
      uniforms_(),
      // Copy Tensor storage
      storage_(other.storage_) {
  uniform_data_ = std::make_shared<UniformData>(UniformData{
+      static_cast<size_t>(utils::multiply_integers(sizes_)),
      sizes_,
-      create_whcn_dim_order(dim_order_),
-      unsqueeze_strides(strides_, numel_),
-      other.logical_limits(),
-      static_cast<size_t>(utils::multiply_integers(sizes_))});
+      dim_order_,
+      strides_,
+      other.logical_limits()});

  VK_CHECK_COND(
      dim_order_is_valid(dim_order_), "new dim order provided is invalid");
}

+vTensor::UniformData::UniformData(
+    const size_t numel_ll,
+    const std::vector<int64_t>& sizes,
+    const std::vector<int64_t>& dim_order,
+    const std::vector<int64_t>& strides,
+    const utils::uvec3& limits)
+    : numel(utils::safe_downcast<int32_t>(numel_ll)),
+      sizes_v(flip_and_unsqueeze_ivec4(sizes, kTensorSizes, numel_ll)),
+      dim_order_v(
+          flip_and_unsqueeze_ivec4(dim_order, kTensorDimOrder, numel_ll)),
+      strides_v(flip_and_unsqueeze_ivec4(strides, kTensorStrides, numel_ll)),
+      logical_limits(limits) {}
+
uint32_t vTensor::UniformData::write_attribute(
    void* dst,
    const uint32_t dst_offset,
@@ -727,11 +699,11 @@ uint32_t vTensor::UniformData::write_attribute(
    return sizeof(member_name); \
  }
  switch (attr) {
+    WRITE_ATTRIBUTE_CASE(NUMEL, numel);
    WRITE_ATTRIBUTE_CASE(SIZES, sizes_v);
-    WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, whcn_dim_order_v);
+    WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, dim_order_v);
    WRITE_ATTRIBUTE_CASE(STRIDES, strides_v);
    WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits);
-    WRITE_ATTRIBUTE_CASE(NUMEL, numel);
    default:
      VK_THROW("Invalid Attribute");
  }
@@ -806,84 +778,25 @@ size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const {
}

const vkapi::BufferBindInfo vTensor::sizes_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (sizes_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    sizes_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), sizes_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(&sizes_uniform_offset_, uniform_data_->sizes_v);
}

const vkapi::BufferBindInfo vTensor::dim_order_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (dim_order_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    dim_order_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(
-        uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), dim_order_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(
+      &dim_order_uniform_offset_, uniform_data_->dim_order_v);
}

const vkapi::BufferBindInfo vTensor::strides_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (strides_uniform_offset == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    strides_uniform_offset = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(uniform_data_->strides_v, strides_uniform_offset);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), strides_uniform_offset, nbytes_per_ubo_);
+  return metadata_ubo_impl(&strides_uniform_offset, uniform_data_->strides_v);
}

const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    logical_limits_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), logical_limits_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(
+      &logical_limits_uniform_offset_, uniform_data_->logical_limits);
}

const vkapi::BufferBindInfo vTensor::numel_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (numel_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    numel_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(numel(), numel_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), numel_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(&numel_uniform_offset_, uniform_data_->numel);
}

VkMemoryRequirements vTensor::get_memory_requirements() const {
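All five getters now funnel into metadata_ubo_impl, which is defined outside this hunk. Going by the five removed bodies, a plausible shape for such a helper is sketched below; the parameter types, the template, and the aligned-size bookkeeping are guesses for illustration, not the actual implementation:

```cpp
// Sketch only: lazily create the shared ParamsBuffer, hand the field a stable
// offset on first use, write the payload there, and return a binding for just
// that slice. Member names come from the removed bodies above.
template <typename T>
const vkapi::BufferBindInfo vTensor::metadata_ubo_impl(
    uint32_t* offset_ptr,
    const T& payload) {
  if (!uniforms_.buffer()) {
    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
  }
  // Assumed: each field occupies its own alignment-padded slice.
  const size_t slice_nbytes = utils::align_up(sizeof(T), min_nbytes_per_ubo_);
  if (*offset_ptr == kUniformOffsetUnset) {
    VK_CHECK_COND(
        (uniforms_size_ + slice_nbytes) <= max_ubo_nbytes_,
        "Uniform data allocation has exceeded Tensor uniform buffer size");
    *offset_ptr = uniforms_size_;
    uniforms_size_ += slice_nbytes;
    uniforms_.update(payload, *offset_ptr);
  }
  return vkapi::BufferBindInfo(uniforms_.buffer(), *offset_ptr, slice_nbytes);
}
```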
@@ -936,22 +849,21 @@ void vTensor::update_metadata() {
  strides_ = calculate_strides(sizes_, dim_order_);

  // Update uniform data if it has been modified
-  uniform_data_->numel = numel_;
-  uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_);
-  uniform_data_->whcn_dim_order_v =
-      utils::make_ivec4(create_whcn_dim_order(dim_order_));
-  uniform_data_->strides_v =
-      utils::make_whcn_ivec4(unsqueeze_strides(strides_, numel_));
  uniform_data_->numel = utils::safe_downcast<int32_t>(numel_);
+  uniform_data_->sizes_v =
+      flip_and_unsqueeze_ivec4(sizes_, kTensorSizes, numel_);
+  uniform_data_->dim_order_v =
+      flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_);
+  uniform_data_->strides_v =
+      flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_);
  uniform_data_->logical_limits.limits =
      calculate_logical_limits(sizes_, axis_map_, packed_dim_);

  if (sizes_uniform_offset_ != kUniformOffsetUnset) {
    uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
  }
  if (dim_order_uniform_offset_ != kUniformOffsetUnset) {
-    uniforms_.update(
-        uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_);
+    uniforms_.update(uniform_data_->dim_order_v, dim_order_uniform_offset_);
  }
  if (strides_uniform_offset != kUniformOffsetUnset) {
    uniforms_.update(uniform_data_->strides_v, strides_uniform_offset);