From 1e64bedf5ed7fb00167e87bc6d0771747c99e14f Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Mon, 19 May 2025 11:55:20 -0700 Subject: [PATCH 1/2] [ET-VK][ez] Test command buffer re-encoding on resize ## Context Add a test where `encode_execute()` is called again after resizing model inputs and propagating the new sizes. Currently, dynamic shapes are handled by simply updating the tensor metadata when sizes are updated. Compute shaders will perform the same computations with the updated tensor sizes/strides information. However, for some operators, different input sizes require different compute shaders in order to achieve maximum performance. One example of this is matrix multiplication, where matrix-matrix multiplication typically uses a different algorithm than vector-matrix (or matrix-vector) multiplication. Therefore, for some models, it would be best to trigger a re-encoding of the command buffer upon input resize, so that different compute shaders can be selected based on the current input sizes. The actual changes for enabling shader re-selection will be introduced in the next diff. This diff simply checks that command buffer re-encoding "works as advertised". ## Changes This diff adds a test in `vulkan_compute_api_test` to check whether the ComputeGraph API can handle the `encode_execute` function being called multiple times. To support the test, shared object allocation and binding is moved from `ComputeGraph::encode_execute()` to `ComputeGraph::prepare()`, so that calling `encode_execute()` again does not repeat it, and the matmul graph construction in `test_mm` is factored out into a reusable `build_mm_graph` helper in `test_utils`. 
Differential Revision: [D75013781](https://our.internmc.facebook.com/intern/diff/D75013781/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ComputeGraph.cpp | 10 +- backends/vulkan/test/utils/test_utils.cpp | 53 +++++++++++ backends/vulkan/test/utils/test_utils.h | 36 +++++++ .../vulkan/test/vulkan_compute_api_test.cpp | 93 +++++++++++-------- 4 files changed, 150 insertions(+), 42 deletions(-) diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 59fd561a2c5..1214c89e00a 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -612,6 +612,11 @@ void ComputeGraph::prepare() { if (config_.enable_querypool) { context_->initialize_querypool(); } + + for (SharedObject& shared_object : shared_objects_) { + shared_object.allocate(this); + shared_object.bind_users(this); + } } void ComputeGraph::encode_prepack() { @@ -636,11 +641,6 @@ void ComputeGraph::encode_execute() { context_->cmd_reset_querypool(); - for (SharedObject& shared_object : shared_objects_) { - shared_object.allocate(this); - shared_object.bind_users(this); - } - for (std::unique_ptr& node : execute_nodes_) { node->encode(this); } diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 3b6195a5c26..c4acb41b7b0 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -537,6 +537,59 @@ void execute_graph_and_check_output( } } +vkcompute::ComputeGraph build_mm_graph( + int B, + int M, + int K, + int N, + vkcompute::vkapi::ScalarType dtype, + vkcompute::utils::StorageType in_out_stype, + vkcompute::utils::GPUMemoryLayout memory_layout, + const bool prepack_mat2, + const float mat2_val) { + using namespace vkcompute; + GraphConfig config; + ComputeGraph graph(config); + + std::vector mat1_size = {M, K}; + std::vector mat2_size = {K, N}; + std::vector out_size = {M, N}; + if (B > 1) { + 
mat1_size.resize(3); + mat1_size = {B, M, K}; + mat2_size.resize(3); + mat2_size = {B, K, N}; + out_size.resize(3); + out_size = {B, M, N}; + } + + IOValueRef mat1 = + graph.add_input_tensor(mat1_size, dtype, in_out_stype, memory_layout); + IOValueRef mat2{}; + + CREATE_RAND_WEIGHT_TENSOR(mat2_w, mat2_size, dtype); + if (mat2_val != 0.0f) { + std::fill(data_mat2_w.begin(), data_mat2_w.end(), mat2_val); + } + + if (prepack_mat2) { + mat2.value = mat2_w; + } else { + mat2.value = + graph.add_tensor(mat2_size, dtype, in_out_stype, memory_layout); + mat2.staging = graph.set_input_tensor(mat2.value); + } + + IOValueRef out; + out.value = graph.add_tensor(out_size, dtype, in_out_stype, memory_layout); + + VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2.value, out.value}); + + out.staging = graph.set_output_tensor(out.value); + + return graph; +} + bool check_close(float a, float b, float atol, float rtol) { float max = std::max(std::abs(a), std::abs(b)); float diff = std::abs(a - b); diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index f3ee2a717a5..71d6d0bc0de 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -8,6 +8,8 @@ #pragma once +#include + #include #include @@ -16,6 +18,8 @@ #include #include +#include + #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \ vkcompute::api::vTensor( \ vkcompute::api::context(), \ @@ -135,6 +139,22 @@ void record_matmul_texture3d( // Input & Output Utilities // +inline std::vector create_random_float_vector( + const size_t numel, + const float min = 0.0f, + const float max = 1.0f) { + std::vector result(numel); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(min, max); + + for (size_t i = 0; i < numel; ++i) { + result[i] = dis(gen); + } + + return result; +} + inline void fill_staging( vkcompute::api::StagingBuffer& staging, float val, @@ -232,6 +252,22 @@ void 
execute_graph_and_check_output( std::vector input_vals, std::vector expected_outputs); +#define CREATE_RAND_WEIGHT_TENSOR(name, sizes, dtype) \ + std::vector data_##name = \ + create_random_float_buffer(utils::multiply_integers(sizes)); \ + ValueRef name = graph.add_tensorref(sizes, dtype, data_##name.data()); + +vkcompute::ComputeGraph build_mm_graph( + int B, + int M, + int K, + int N, + vkcompute::vkapi::ScalarType dtype, + vkcompute::utils::StorageType in_out_stype, + vkcompute::utils::GPUMemoryLayout memory_layout, + const bool prepack_mat2 = false, + const float mat2_val = 0.0f); + // // Debugging Utilities // diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 143e6704889..cf42a846db5 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -2753,43 +2753,8 @@ void test_mm( utils::StorageType storage_type, utils::GPUMemoryLayout memory_layout, bool prepack = true) { - GraphConfig config; - config.set_storage_type_override(storage_type); - ComputeGraph graph(config); - - std::vector mat1_size = {M, K}; - std::vector mat2_size = {K, N}; - std::vector out_size = {M, N}; - if (B > 1) { - mat1_size.resize(3); - mat1_size = {B, M, K}; - mat2_size.resize(3); - mat2_size = {B, K, N}; - out_size.resize(3); - out_size = {B, M, N}; - } - - IOValueRef mat2{}; - - CREATE_WEIGHT_TENSOR(mat2_w, mat2_size, dtype, 2.0f); - - // Build graph - - IOValueRef mat1 = graph.add_input_tensor(mat1_size, dtype, memory_layout); - - if (prepack) { - mat2.value = mat2_w; - } else { - mat2.value = graph.add_tensor(mat2_size, dtype, memory_layout); - mat2.staging = graph.set_input_tensor(mat2.value); - } - - IOValueRef out; - out.value = graph.add_tensor(out_size, dtype, memory_layout); - - VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2.value, out.value}); - - out.staging = graph.set_output_tensor(out.value); + ComputeGraph graph = build_mm_graph( + B, 
M, K, N, dtype, storage_type, memory_layout, prepack, 2.0f); graph.prepare(); graph.encode_prepack(); @@ -2855,6 +2820,60 @@ TEST(VulkanComputeGraphOpsTest, mm_smoke_test) { #undef RUN_TESTS } +void test_mm_with_resize_reencode( + int B, + int M, + int K, + int N, + vkapi::ScalarType dtype, + utils::StorageType storage_type, + utils::GPUMemoryLayout memory_layout) { + ASSERT_TRUE(M > 1); + + ComputeGraph graph = build_mm_graph( + B, M, K, N, dtype, storage_type, memory_layout, false, 2.0f); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + + for (int i = 1; i < 4; i++) { + float val_mat1 = i; + float val_mat2 = i + 1; + float val_out = K * (val_mat1 * val_mat2); + execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); + } + + // Switch to GEMV mode + int new_K = K / 2; + std::vector new_mat1_size = {1, new_K}; + std::vector new_mat2_size = {new_K, N}; + graph.resize_input(0, new_mat1_size); + graph.resize_input(1, new_mat2_size); + graph.propagate_resize(); + + graph.encode_execute(); + + for (int i = 1; i < 4; i++) { + float val_mat1 = i; + float val_mat2 = i + 1; + float val_out = new_K * (val_mat1 * val_mat2); + execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); + } +} + +TEST(VulkanComputeGraphOpsTest, test_graph_resize_reencode) { + test_mm_with_resize_reencode( + /*B = */ 1, + /*M = */ 31, + /*K = */ 127, + /*N = */ 23, + vkapi::kFloat, + utils::kTexture3D, + utils::kWidthPacked); +} + void test_max_pool2d( const std::vector& in_size, const int64_t base_val, From 92d703e12b645b8ce4408ffa903a3094c3750419 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Mon, 19 May 2025 12:31:54 -0700 Subject: [PATCH 2/2] Update on "[ET-VK][ez] Test command buffer re-encoding on resize" ## Context Add a test where `encode_execute()` is called again after resizing model inputs and propagating the new sizes. 
Currently, dynamic shapes are handled by simply updating the tensor metadata when sizes are updated. Compute shaders will perform the same computations with the updated tensor sizes/strides information. However, for some operators, different input sizes require different compute shaders in order to achieve maximum performance. One example of this is matrix multiplication, where matrix-matrix multiplication typically uses a different algorithm than vector-matrix (or matrix-vector) multiplication. Therefore, for some models, it would be best to trigger a re-encoding of the command buffer upon input resize, so that different compute shaders can be selected based on the current input sizes. The actual changes for enabling shader re-selection will be introduced in the next diff. This diff simply checks that command buffer re-encoding "works as advertised". ## Changes This diff adds a test in `vulkan_compute_api_test` to check whether the ComputeGraph API can handle the `encode_execute` function being called multiple times. To support the test, shared object allocation and binding is moved from `ComputeGraph::encode_execute()` to `ComputeGraph::prepare()`, so that calling `encode_execute()` again does not repeat it, and the matmul graph construction in `test_mm` is factored out into a reusable `build_mm_graph` helper in `test_utils`. Differential Revision: [D75013781](https://our.internmc.facebook.com/intern/diff/D75013781/) [ghstack-poisoned]