Skip to content

Commit b43aa88

Browse files
committed
Update on "[ET-VK] Used hashed layout instead of axis map UBO"
## Context #6358 showed that passing in the axis map of a tensor via a specialization constant allows shaders to utilize the axis map in indexing calculations with minimal impact to latency. This diff extends that idea, and introduces the concept of a hashed layout. The hashed layout is a 32-bit integer where: 1. Bits 28-31: `axis_map[0]` 2. Bits 24-27: `axis_map[1]` 3. Bits 20-23: `axis_map[2]` 4. Bits 16-19: `axis_map[3]` 5. Bits 12-15: `packed_dim` 6. Bits 0-11: unused Essentially, the integer is divided into chunks of 4 bits, and each chunk is used to represent a value from the `axis_map` + `packed_dim`. This way, the entire description of how the tensor is represented as a texture can be passed into a compute shader with a single specialization constant. Within the compute shader, the axis map and packed dim can be extracted like so: ``` ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); const lowp int in_packed_dim = unhash_packed_dim(in_layout); ``` Note that `lowp` can be used because the expected values are limited by the dimensionality of the tensor; therefore, we expect only small values. ## Changes 1. Introduce `hashed_layout` 2. Replace all uses of `axis_map_ubo` with `hashed_layout` 3. Remove `axis_map_ubo` from `vTensor`. This also reduces the size of the class. Differential Revision: [D65085141](https://our.internmc.facebook.com/intern/diff/D65085141/) [ghstack-poisoned]
1 parent cc3cd83 commit b43aa88

File tree

4 files changed

+12
-17
lines changed

4 files changed

+12
-17
lines changed

backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,13 @@ layout(std430) buffer;
2121
${layout_declare_buffer(B, "w", "nchw_out", "int")}
2222
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
2323
${layout_declare_ubo(B, "ivec4", "tensor_sizes")}
24-
${layout_declare_ubo(B, "ivec4", "axis_map")}
2524
${layout_declare_ubo(B, "int", "out_numel")}
2625

2726
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2827

29-
layout(constant_id = 3) const int packed_dim = C_DIM;
28+
${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")}
29+
const lowp ivec4 axis_map = unhash_axis_map(t_layout);
30+
const lowp int packed_dim = unhash_packed_dim(t_layout);
3031

3132
void main() {
3233
const int out_buf_idx = int(gl_GlobalInvocationID.x);

backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ layout(std430) buffer;
2323
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
2424
${layout_declare_buffer(B, "r", "nchw_in", "int")}
2525
${layout_declare_ubo(B, "ivec4", "sizes")}
26-
${layout_declare_ubo(B, "ivec4", "axis_map")}
2726

2827
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2928

30-
layout(constant_id = 3) const int packed_dim = C_DIM;
29+
${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")}
30+
const lowp ivec4 axis_map = unhash_axis_map(t_layout);
31+
const lowp int packed_dim = unhash_packed_dim(t_layout);
3132

3233
/*
3334
* Extends sign of int8

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -126,13 +126,12 @@ void record_bitw8_image_to_nchw_nobitw8buffer_op(
126126
pipeline_barrier,
127127
global_wg_size,
128128
adaptive_work_group_size(global_wg_size),
129-
{v_src.packed_dim()},
129+
{v_src.hashed_layout()},
130130
VK_NULL_HANDLE,
131131
0,
132132
dst_buffer.buffer(),
133133
v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
134134
v_src.sizes_ubo(),
135-
v_src.axis_map_ubo(),
136135
v_src.numel_ubo());
137136
}
138137

@@ -335,7 +334,7 @@ void record_matmul_texture3d(
335334
pipeline_barrier,
336335
global_wg_size,
337336
{8, 8, 1},
338-
{out.packed_dim(), mat1.packed_dim(), mat2.packed_dim()},
337+
{out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()},
339338
VK_NULL_HANDLE,
340339
0,
341340
out.image(
@@ -346,11 +345,8 @@ void record_matmul_texture3d(
346345
mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
347346
out.sizes_ubo(),
348347
out.logical_limits_ubo(),
349-
out.axis_map_ubo(),
350348
mat1.sizes_ubo(),
351-
mat1.axis_map_ubo(),
352-
mat2.sizes_ubo(),
353-
mat2.axis_map_ubo());
349+
mat2.sizes_ubo());
354350
}
355351

356352
//

backends/vulkan/test/vulkan_compute_api_test.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1602,8 +1602,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
16021602

16031603
// +2: alpha UBO, broadcast UBO for arithmetic shader
16041604
// +1: t.sizes_ubo() for arithmetic shader output c
1605-
// +1: t.axis_map_ubo() for arithmetic shader output c
1606-
expected_vma_allocation_count += 4;
1605+
expected_vma_allocation_count += 3;
16071606
EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
16081607

16091608
IOValueRef d = graph.add_input_tensor(
@@ -1612,9 +1611,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
16121611
/*shared_object_idx = */ 2);
16131612

16141613
// +1: t.sizes_ubo() uniform buffer for staging shader
1615-
// +1: t.axis_map_ubo() uniform buffer for staging shader
16161614
// +1: staging buffer for the input tensor
1617-
expected_vma_allocation_count += 3;
1615+
expected_vma_allocation_count += 2;
16181616
EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
16191617

16201618
ValueRef e = graph.add_tensor(
@@ -1627,8 +1625,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
16271625

16281626
// +2: alpha UBO, broadcast UBO for arithmetic shader
16291627
// +1: t.sizes_ubo() for arithmetic shader output e
1630-
// +1: t.axis_map_ubo() for arithmetic shader output e
1631-
expected_vma_allocation_count += 4;
1628+
expected_vma_allocation_count += 3;
16321629
EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
16331630

16341631
IOValueRef out = {};

0 commit comments

Comments
 (0)