From 1e64bedf5ed7fb00167e87bc6d0771747c99e14f Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Mon, 19 May 2025 11:55:20 -0700
Subject: [PATCH 1/2] [ET-VK][ez] Test command buffer re-encoding on resize

## Context

Add a test where `encode_execute()` is called again after resizing model inputs and propagating the new sizes.

Currently, dynamic shapes are handled by simply updating the tensor metadata when sizes are updated. Compute shaders will perform the same computations with the updated tensor sizes/strides information.

However, for some operators, different input sizes require different compute shaders in order to achieve maximum performance. One example of this is matrix multiplication, where matrix-matrix multiplication typically uses a different algorithm than vector-matrix (or matrix-vector) multiplication.

Therefore, for some models, it would be best to trigger a re-encoding of the command buffer upon input resize, so that different compute shaders can be selected based on the current input sizes.

The actual changes for enabling shader re-selection will be introduced in the next diff. This diff simply checks that command buffer re-encoding "works as advertised".

## Changes

This diff simply adds a test in `vulkan_compute_api_test` to check that the ComputeGraph API can handle the `encode_execute` function being called multiple times.

Differential Revision: [D75013781](https://our.internmc.facebook.com/intern/diff/D75013781/)

[ghstack-poisoned]
---
 .../vulkan/runtime/graph/ComputeGraph.cpp     | 10 +-
 backends/vulkan/test/utils/test_utils.cpp     | 53 +++++++++++
 backends/vulkan/test/utils/test_utils.h       | 36 +++++++
 .../vulkan/test/vulkan_compute_api_test.cpp   | 93 +++++++++++--------
 4 files changed, 150 insertions(+), 42 deletions(-)
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 59fd561a2c5..1214c89e00a 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -612,6 +612,11 @@ void ComputeGraph::prepare() {
   if (config_.enable_querypool) {
     context_->initialize_querypool();
   }
+
+  for (SharedObject& shared_object : shared_objects_) {
+    shared_object.allocate(this);
+    shared_object.bind_users(this);
+  }
 }
 
 void ComputeGraph::encode_prepack() {
@@ -636,11 +641,6 @@ void ComputeGraph::encode_execute() {
 
   context_->cmd_reset_querypool();
 
-  for (SharedObject& shared_object : shared_objects_) {
-    shared_object.allocate(this);
-    shared_object.bind_users(this);
-  }
-
   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
     node->encode(this);
   }
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index 3b6195a5c26..c4acb41b7b0 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -537,6 +537,59 @@ void execute_graph_and_check_output(
   }
 }
 
+vkcompute::ComputeGraph build_mm_graph( // builds a graph with one "aten.mm.default" op: out = mat1 x mat2
+    int B, // batch size; a leading batch dim is only added to the shapes when B > 1
+    int M, // rows of mat1 and of the output
+    int K, // shared inner dim: cols of mat1, rows of mat2
+    int N, // cols of mat2 and of the output
+    vkcompute::vkapi::ScalarType dtype, // element type used for every tensor added here
+    vkcompute::utils::StorageType in_out_stype, // storage type for mat1, out, and non-prepacked mat2
+    vkcompute::utils::GPUMemoryLayout memory_layout, // layout for mat1, out, and non-prepacked mat2
+    const bool prepack_mat2, // true: mat2 is the constant weight ref; false: mat2 is a second graph input
+    const float mat2_val) { // non-zero: weight data is overwritten with this constant
+  using namespace vkcompute;
+  GraphConfig config; // default config; storage type is passed per tensor instead of via an override
+  ComputeGraph graph(config);
+
+  std::vector<int64_t> mat1_size = {M, K};
+  std::vector<int64_t> mat2_size = {K, N};
+  std::vector<int64_t> out_size = {M, N};
+  if (B > 1) { // batched case: switch all three shapes to 3D
+    mat1_size.resize(3);
+    mat1_size = {B, M, K};
+    mat2_size.resize(3);
+    mat2_size = {B, K, N};
+    out_size.resize(3);
+    out_size = {B, M, N};
+  }
+
+  IOValueRef mat1 =
+      graph.add_input_tensor(mat1_size, dtype, in_out_stype, memory_layout);
+  IOValueRef mat2{};
+
+  CREATE_RAND_WEIGHT_TENSOR(mat2_w, mat2_size, dtype); // NOTE(review): data_mat2_w is local to this function; if add_tensorref keeps the raw pointer it dangles once the graph is returned - confirm
+  if (mat2_val != 0.0f) { // 0.0f acts as "keep the generated data"
+    std::fill(data_mat2_w.begin(), data_mat2_w.end(), mat2_val);
+  }
+
+  if (prepack_mat2) {
+    mat2.value = mat2_w; // feed the weight ref straight to the op; no staging is set
+  } else {
+    mat2.value =
+        graph.add_tensor(mat2_size, dtype, in_out_stype, memory_layout);
+    mat2.staging = graph.set_input_tensor(mat2.value); // mat2_w is still added to the graph above but is not wired into the op
+  }
+
+  IOValueRef out;
+  out.value = graph.add_tensor(out_size, dtype, in_out_stype, memory_layout);
+
+  VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2.value, out.value});
+
+  out.staging = graph.set_output_tensor(out.value);
+
+  return graph; // prepare/encode/execute are left to the caller
+}
+
 bool check_close(float a, float b, float atol, float rtol) {
   float max = std::max(std::abs(a), std::abs(b));
   float diff = std::abs(a - b);
diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h
index f3ee2a717a5..71d6d0bc0de 100644
--- a/backends/vulkan/test/utils/test_utils.h
+++ b/backends/vulkan/test/utils/test_utils.h
@@ -8,6 +8,8 @@
 
 #pragma once
 
+#include <random>
+
 #include <gtest/gtest.h>
 
 #include <executorch/backends/vulkan/runtime/api/api.h>
@@ -16,6 +18,8 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
 #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory)  \
   vkcompute::api::vTensor(                            \
       vkcompute::api::context(),                      \
@@ -135,6 +139,22 @@ void record_matmul_texture3d(
 // Input & Output Utilities
 //
 
+inline std::vector<float> create_random_float_vector( // returns numel floats drawn uniformly from [min, max)
+    const size_t numel, // number of elements to generate
+    const float min = 0.0f, // lower bound of the distribution
+    const float max = 1.0f) { // upper bound of the distribution
+  std::vector<float> result(numel);
+  std::random_device rd;
+  std::mt19937 gen(rd()); // seeded from random_device on every call, so results are not reproducible across calls
+  std::uniform_real_distribution<float> dis(min, max);
+
+  for (size_t i = 0; i < numel; ++i) {
+    result[i] = dis(gen);
+  }
+
+  return result;
+}
+
 inline void fill_staging(
     vkcompute::api::StagingBuffer& staging,
     float val,
@@ -232,6 +252,22 @@ void execute_graph_and_check_output(
     std::vector<float> input_vals,
     std::vector<float> expected_outputs);
 
+#define CREATE_RAND_WEIGHT_TENSOR(name, sizes, dtype)              \
+  std::vector<float> data_##name =                                 \
+      create_random_float_buffer(utils::multiply_integers(sizes)); \
+  ValueRef name = graph.add_tensorref(sizes, dtype, data_##name.data()); // declares data_<name> and <name> in the caller's scope; needs `graph`, `utils`, `ValueRef` and create_random_float_buffer (not defined in this patch - presumably pre-existing) to be visible there
+
+vkcompute::ComputeGraph build_mm_graph( // builds a graph with a single "aten.mm.default" op; defined in test_utils.cpp
+    int B, // batch size; shapes only get a batch dim when B > 1
+    int M, // rows of mat1 and of the output
+    int K, // shared inner dim: cols of mat1, rows of mat2
+    int N, // cols of mat2 and of the output
+    vkcompute::vkapi::ScalarType dtype, // element type of all tensors
+    vkcompute::utils::StorageType in_out_stype, // storage type of the input/output tensors
+    vkcompute::utils::GPUMemoryLayout memory_layout, // memory layout of the input/output tensors
+    const bool prepack_mat2 = false, // true: mat2 is a constant weight ref; false: mat2 is a graph input
+    const float mat2_val = 0.0f); // non-zero: weight data is filled with this value; 0.0f keeps the generated data
+
 //
 // Debugging Utilities
 //
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 143e6704889..cf42a846db5 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -2753,43 +2753,8 @@ void test_mm(
     utils::StorageType storage_type,
     utils::GPUMemoryLayout memory_layout,
     bool prepack = true) {
-  GraphConfig config;
-  config.set_storage_type_override(storage_type);
-  ComputeGraph graph(config);
-
-  std::vector<int64_t> mat1_size = {M, K};
-  std::vector<int64_t> mat2_size = {K, N};
-  std::vector<int64_t> out_size = {M, N};
-  if (B > 1) {
-    mat1_size.resize(3);
-    mat1_size = {B, M, K};
-    mat2_size.resize(3);
-    mat2_size = {B, K, N};
-    out_size.resize(3);
-    out_size = {B, M, N};
-  }
-
-  IOValueRef mat2{};
-
-  CREATE_WEIGHT_TENSOR(mat2_w, mat2_size, dtype, 2.0f);
-
-  // Build graph
-
-  IOValueRef mat1 = graph.add_input_tensor(mat1_size, dtype, memory_layout);
-
-  if (prepack) {
-    mat2.value = mat2_w;
-  } else {
-    mat2.value = graph.add_tensor(mat2_size, dtype, memory_layout);
-    mat2.staging = graph.set_input_tensor(mat2.value);
-  }
-
-  IOValueRef out;
-  out.value = graph.add_tensor(out_size, dtype, memory_layout);
-
-  VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2.value, out.value});
-
-  out.staging = graph.set_output_tensor(out.value);
+  ComputeGraph graph = build_mm_graph(
+      B, M, K, N, dtype, storage_type, memory_layout, prepack, 2.0f);
 
   graph.prepare();
   graph.encode_prepack();
@@ -2855,6 +2820,60 @@ TEST(VulkanComputeGraphOpsTest, mm_smoke_test) {
 #undef RUN_TESTS
 }
 
+void test_mm_with_resize_reencode( // runs an mm graph, resizes both inputs, calls encode_execute() again, and re-checks outputs
+    int B,
+    int M,
+    int K,
+    int N,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type,
+    utils::GPUMemoryLayout memory_layout) {
+  ASSERT_TRUE(M > 1); // mat1 is resized to a single row below, so it must start with more than one
+
+  ComputeGraph graph = build_mm_graph(
+      B, M, K, N, dtype, storage_type, memory_layout, false, 2.0f); // prepack_mat2 = false, so mat2 is a graph input; mat2_val = 2.0f then only fills the unused weight ref
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute(); // first encoding, for the original sizes
+
+  for (int i = 1; i < 4; i++) {
+    float val_mat1 = i;
+    float val_mat2 = i + 1;
+    float val_out = K * (val_mat1 * val_mat2); // sum of K identical products; assumes each input is filled with its constant - TODO confirm
+    execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
+  }
+
+  // Switch to GEMV mode
+  int new_K = K / 2; // integer division
+  std::vector<int64_t> new_mat1_size = {1, new_K}; // NOTE(review): always 2D, even though B > 1 would have built 3D tensors
+  std::vector<int64_t> new_mat2_size = {new_K, N};
+  graph.resize_input(0, new_mat1_size); // presumably input 0 is mat1 and input 1 is mat2 - verify against build_mm_graph
+  graph.resize_input(1, new_mat2_size);
+  graph.propagate_resize();
+
+  graph.encode_execute(); // second encoding on the same graph: the behavior under test
+
+  for (int i = 1; i < 4; i++) {
+    float val_mat1 = i;
+    float val_mat2 = i + 1;
+    float val_out = new_K * (val_mat1 * val_mat2); // same check as above with the reduced inner dim
+    execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
+  }
+}
+
+TEST(VulkanComputeGraphOpsTest, test_graph_resize_reencode) { // single case: non-batched float mm, texture3D storage, width-packed layout
+  test_mm_with_resize_reencode(
+      /*B = */ 1,
+      /*M = */ 31,
+      /*K = */ 127, // odd, so the helper's K / 2 truncates to 63
+      /*N = */ 23,
+      vkapi::kFloat,
+      utils::kTexture3D,
+      utils::kWidthPacked);
+}
+
 void test_max_pool2d(
     const std::vector<int64_t>& in_size,
     const int64_t base_val,

From 92d703e12b645b8ce4408ffa903a3094c3750419 Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Mon, 19 May 2025 12:31:54 -0700
Subject: [PATCH 2/2] Update on "[ET-VK][ez] Test command buffer re-encoding on
 resize"

## Context

Add a test where `encode_execute()` is called again after resizing model inputs and propagating the new sizes.

Currently, dynamic shapes are handled by simply updating the tensor metadata when sizes are updated. Compute shaders will perform the same computations with the updated tensor sizes/strides information.

However, for some operators, different input sizes require different compute shaders in order to achieve maximum performance. One example of this is matrix multiplication, where matrix-matrix multiplication typically uses a different algorithm than vector-matrix (or matrix-vector) multiplication.

Therefore, for some models, it would be best to trigger a re-encoding of the command buffer upon input resize, so that different compute shaders can be selected based on the current input sizes.

The actual changes for enabling shader re-selection will be introduced in the next diff. This diff simply checks that command buffer re-encoding "works as advertised".

## Changes

This diff simply adds a test in `vulkan_compute_api_test` to check that the ComputeGraph API can handle the `encode_execute` function being called multiple times.

Differential Revision: [D75013781](https://our.internmc.facebook.com/intern/diff/D75013781/)

[ghstack-poisoned]