pytorch · SS-JIA · Oct 17, 2024 · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -285,7 +285,8 @@ ValueRef ComputeGraph::add_tensor_like(
 ValueRef ComputeGraph::add_tensor_like(
     const ValueRef idx,
     const utils::GPUMemoryLayout memory_layout) {
-  return add_tensor(sizes_of(idx), dtype_of(idx), memory_layout);
+  return add_tensor(
+      sizes_of(idx), dtype_of(idx), storage_type_of(idx), memory_layout);
 }
 
 ValueRef ComputeGraph::add_tensor(

diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
@@ -18,9 +18,10 @@
 
 namespace vkcompute {
 
-ValueRef prepack_arg(
+ValueRef check_and_prepack_arg(
     ComputeGraph& graph,
     ValueRef arg_ref,
+    const utils::StorageType stype,
     int64_t num_channels,
     const std::string& debug_name) {
   VK_CHECK_COND(
@@ -33,7 +34,7 @@ ValueRef prepack_arg(
   // batch_norm's param are broadcasted on the channel dimension.
   // In this implementation, we pack the weights along the x dimension, and
   // in the shader, we lookup using the along the x.
-  return prepack_if_tensor_ref(graph, arg_ref, utils::kWidthPacked);
+  return prepack_standard(graph, arg_ref, stype, utils::kWidthPacked);
 }
 
 void add_native_batch_norm_node(
@@ -51,22 +52,26 @@ void add_native_batch_norm_node(
   VK_CHECK_COND(in_sizes.size() == 4, "BatchNorm only support 4d tensor");
   VK_CHECK_COND(out_sizes.size() == 4, "BatchNorm only support 4d tensor");
 
+  // Only the first element of the return value is propagated. The remaining 2
+  // elements are zero-size dummy tensors.
+  ValueRef out_ref = graph.get_value_list(out_tuple_ref)->at(0);
+
+  utils::StorageType stype = graph.storage_type_of(out_ref);
+
   int64_t num_channels = dim_at<kChannel4D>(in_sizes);
 
-  ValueRef arg_weight = prepack_arg(graph, weight_ref, num_channels, "weight");
-  ValueRef arg_bias = prepack_arg(graph, bias_ref, num_channels, "bias");
-  ValueRef arg_mean = prepack_arg(graph, mean_ref, num_channels, "mean");
-  ValueRef arg_var = prepack_arg(graph, var_ref, num_channels, "var");
+  ValueRef arg_weight =
+      check_and_prepack_arg(graph, weight_ref, stype, num_channels, "weight");
+  ValueRef arg_bias =
+      check_and_prepack_arg(graph, bias_ref, stype, num_channels, "bias");
+  ValueRef arg_mean =
+      check_and_prepack_arg(graph, mean_ref, stype, num_channels, "mean");
+  ValueRef arg_var =
+      check_and_prepack_arg(graph, var_ref, stype, num_channels, "var");
   float epsilon = graph.extract_scalar<float>(eps_ref);
 
   vTensorPtr t_in = graph.get_tensor(in_ref);
 
-  // Only the first element of the return value is propagated. The remaining 2
-  // elements are zero-size dummy tensor.
-  const auto out_tuple_val = graph.get_value_list(out_tuple_ref);
-
-  ValueRef out_ref = out_tuple_val->at(0);
-
   VK_CHECK_COND(!graph.val_is_tref(out_ref), "Output should not be tref");
   vTensorPtr t_out = graph.get_tensor(out_ref);
 

diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -51,9 +51,8 @@ void add_binary_op_node(
     const ValueRef alpha,
     const ValueRef out,
     const std::string& op_name) {
-  ValueRef arg1 = prepack_if_tensor_ref(graph, in1);
-  ValueRef arg2 =
-      prepack_if_tensor_ref(graph, in2, graph.estimate_memory_layout_of(arg1));
+  ValueRef arg1 = prepack_standard_like(graph, in1, out, true);
+  ValueRef arg2 = prepack_standard_like(graph, in2, out, true);
 
   vTensorPtr t_in1 = graph.get_tensor(arg1);
   vTensorPtr t_in2 = graph.get_tensor(arg2);

diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -304,7 +304,7 @@ utils::uvec3 create_conv2d_global_wg_size(
 void add_conv2d_node(
     ComputeGraph& graph,
     const ValueRef in,
-    const ValueRef weight,
+    const ValueRef weight_data,
     const ValueRef bias,
     const ValueRef stride,
     const ValueRef padding,
@@ -330,19 +330,18 @@ void add_conv2d_node(
   const int64_t groups_val = graph.get_int(groups);
 
   const Conv2dMethod method =
-      get_conv2d_method(graph, weight, groups_val, transposed_val);
+      get_conv2d_method(graph, weight_data, groups_val, transposed_val);
 
-  ValueRef arg_in = prepack_if_tensor_ref(graph, in);
-  ValueRef arg_weight = prepack_weights(graph, weight, method);
+  ValueRef arg_weight = prepack_weights(graph, weight_data, method);
   ValueRef arg_bias = prepack_biases(
       graph,
       bias,
-      weight,
+      weight_data,
       transposed_val,
       /* storage_type = */ utils::kTexture2D,
       /* memory_layout = */ utils::kWidthPacked);
 
-  vTensorPtr t_in = graph.get_tensor(arg_in);
+  vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
   if (t_in->sizes().at(0) > 1) {
     VK_THROW("conv2d: input batch size > 1 is not supported yet!");
@@ -351,20 +350,25 @@ void add_conv2d_node(
 
   Kernel2dParams kernel_params = create_kernel2d_params(
       graph,
-      weight,
+      weight_data,
       /*kernel_size_only = */ false,
       stride,
       padding,
       dilation);
   Conv2dParams extra_params =
-      create_conv2d_params(graph, weight, kernel_params, transposed_val);
+      create_conv2d_params(graph, weight_data, kernel_params, transposed_val);
 
   OutputParams out_params = {out_min_val, out_max_val};
 
   check_conv2d_params(kernel_params, transposed_val);
 
   vkapi::ShaderInfo shader = get_conv2d_shader(
-      graph, *t_out, /*prepack_weights = */ false, method, weight, clamp_out);
+      graph,
+      *t_out,
+      /*prepack_weights = */ false,
+      method,
+      weight_data,
+      clamp_out);
 
   graph.execute_nodes().emplace_back(new DispatchNode(
       graph,
@@ -373,7 +377,7 @@ void add_conv2d_node(
       graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, vkapi::MemoryAccessType::WRITE},
-       {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
+       {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
       {
           t_out->logical_limits_ubo(),
@@ -386,7 +390,7 @@ void add_conv2d_node(
       {},
       // Resizing Logic
       resize_conv2d_node,
-      {weight, stride, padding, dilation, transposed, output_padding}));
+      {weight_data, stride, padding, dilation, transposed, output_padding}));
 }
 
 void add_conv1d_node(
@@ -402,9 +406,8 @@ void add_conv1d_node(
     const ValueRef out_max,
     const ValueRef out,
     const bool clamp_out) {
-  ValueRef arg_in = prepack_if_tensor_ref(graph, in);
-  ValueRef arg_weight =
-      prepack_if_tensor_ref(graph, weight, utils::kWidthPacked);
+  ValueRef arg_weight = prepack_standard(
+      graph, weight, graph.storage_type_of(out), utils::kWidthPacked);
   ValueRef arg_bias = prepack_biases(
       graph,
       bias,
@@ -422,7 +425,7 @@ void add_conv1d_node(
     out_max_val = graph.extract_scalar<float>(out_max);
   }
 
-  vTensorPtr t_in = graph.get_tensor(arg_in);
+  vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_weight = graph.get_tensor(arg_weight);
   vTensorPtr t_bias = graph.get_tensor(arg_bias);
   vTensorPtr t_out = graph.get_tensor(out);
@@ -471,7 +474,7 @@ void add_conv1d_node(
       local_size,
       // Inputs and Outputs
       {{out, vkapi::MemoryAccessType::WRITE},
-       {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
+       {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
       {
           t_out->logical_limits_ubo(),

diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp
@@ -57,9 +57,9 @@ void add_embedding_node(
 }
 
 void embedding(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  ValueRef weight = prepack_if_tensor_ref(graph, args[0]);
-  ValueRef in = prepack_if_tensor_ref(graph, args[1]);
+  ValueRef in = args[1];
   ValueRef out = args[5];
+  ValueRef weight = prepack_standard_like(graph, args[0], out);
 
   add_embedding_node(graph, weight, in, out);
 }

diff --git a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp
@@ -108,9 +108,9 @@ int64_t get_dim_idx(ComputeGraph& graph, ValueRef in, ValueRef dim_ref) {
 }
 
 void index_select(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  ValueRef in = prepack_if_tensor_ref(graph, args[0]);
+  ValueRef in = args[0];
   ValueRef dim_ref = args[1];
-  ValueRef idx = prepack_if_tensor_ref(graph, args[2]);
+  ValueRef idx = args[2];
   ValueRef out = args[3];
 
   const int64_t dim_idx = get_dim_idx(graph, in, dim_ref);

diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
@@ -94,8 +94,11 @@ void add_addmm_naive_node(
     const ValueRef out,
     const Params& params,
     const ValueRef mat2_is_transposed) {
-  ValueRef self = prepack_if_tensor_ref(graph, self_data, utils::kWidthPacked);
-  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kHeightPacked);
+  utils::StorageType stype = graph.storage_type_of(out);
+  ValueRef self = prepack_standard(
+      graph, self_data, stype, utils::kWidthPacked, /*passthrough = */ true);
+  ValueRef mat2 = prepack_standard(
+      graph, mat2_data, stype, utils::kHeightPacked, /*passthrough = */ true);
 
   std::string kernel_name =
       graph.get_bool(mat2_is_transposed) ? "linear_naive" : "addmm_naive";
@@ -145,9 +148,11 @@ void add_addmm_optimized_node(
     const ValueRef out,
     const Params& params,
     const ValueRef mat2_is_transposed) {
-  ValueRef self =
-      prepack_if_tensor_ref(graph, self_data, utils::kChannelsPacked);
-  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kHeightPacked);
+  utils::StorageType stype = graph.storage_type_of(out);
+  ValueRef self = prepack_standard(
+      graph, self_data, stype, utils::kChannelsPacked, /*passthrough=*/true);
+  ValueRef mat2 = prepack_standard(
+      graph, mat2_data, stype, utils::kHeightPacked, /*passthrough=*/true);
 
   // Ensure mat1 is width packed
   ValueRef mat1_W_packed = graph.add_tensor_like(mat1, utils::kWidthPacked);
@@ -276,8 +281,8 @@ void linear(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   ValueRef weight_data = args.at(1);
   ValueRef bias = args.at(2);
   ValueRef out = args.at(3);
-  ValueRef weight =
-      prepack_if_tensor_ref(graph, weight_data, utils::kWidthPacked);
+  ValueRef weight = prepack_standard(
+      graph, weight_data, graph.storage_type_of(out), utils::kWidthPacked);
   ValueRef mat2_is_transposed = graph.add_scalar(true);
   if (graph.val_is_none(bias)) {
     return add_matmul_node(graph, input, weight, out, mat2_is_transposed);

diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -62,7 +62,12 @@ void add_matmul_naive_buffer_node(
     const ValueRef mat2_data,
     const ValueRef out,
     const ValueRef mat2_is_transposed) {
-  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kHeightPacked);
+  ValueRef mat2 = prepack_standard(
+      graph,
+      mat2_data,
+      graph.storage_type_of(out),
+      utils::kHeightPacked,
+      /*passthrough = */ true);
 
   std::string kernel_name = "matmul_naive_buffer";
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
@@ -103,7 +108,12 @@ void add_matmul_naive_texture3d_node(
     const ValueRef mat2_data,
     const ValueRef out,
     const ValueRef mat2_is_transposed) {
-  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kHeightPacked);
+  ValueRef mat2 = prepack_standard(
+      graph,
+      mat2_data,
+      graph.storage_type_of(out),
+      utils::kHeightPacked,
+      /*passthrough = */ true);
 
   std::string kernel_name = graph.get_bool(mat2_is_transposed)
       ? "matmul_transposed_naive"
@@ -146,7 +156,12 @@ void add_matmul_optimized_node(
     const ValueRef mat2_data,
     const ValueRef out,
     const ValueRef mat2_is_transposed) {
-  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kHeightPacked);
+  ValueRef mat2 = prepack_standard(
+      graph,
+      mat2_data,
+      graph.storage_type_of(out),
+      utils::kHeightPacked,
+      /*passthrough = */ true);
 
   // Ensure mat1 is width packed
   ValueRef mat1_W_packed = graph.add_tensor_like(mat1, utils::kWidthPacked);

diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp
@@ -57,8 +57,8 @@ void add_native_layer_norm_node(
     ComputeGraph& graph,
     const ValueRef in,
     const ValueRef normalized_shape,
-    const ValueRef weight,
-    const ValueRef bias,
+    const ValueRef weight_data,
+    const ValueRef bias_data,
     const ValueRef eps,
     const ValueRef out) {
   const auto normalized_shape_dim =
@@ -67,19 +67,16 @@ void add_native_layer_norm_node(
     VK_THROW("native_layer_norm only supports normalized_shape with dim == 1");
   }
 
-  if (graph.val_is_none(weight)) {
+  if (graph.val_is_none(weight_data)) {
     VK_THROW("native_layer_norm requires weight to be non-None");
   }
 
-  if (graph.val_is_none(bias)) {
+  if (graph.val_is_none(bias_data)) {
     VK_THROW("native_layer_norm requires bias to be non-None");
   }
 
-  ValueRef arg_in = prepack_if_tensor_ref(graph, in);
-  ValueRef arg_weight = prepack_if_tensor_ref(
-      graph, weight, graph.estimate_memory_layout_of(arg_in));
-  ValueRef arg_bias = prepack_if_tensor_ref(
-      graph, bias, graph.estimate_memory_layout_of(arg_in));
+  ValueRef arg_weight = prepack_standard_like(graph, weight_data, in);
+  ValueRef arg_bias = prepack_standard_like(graph, bias_data, in);
 
   const auto out_val = graph.get_value_list(out);
   vTensorPtr t_out = graph.get_tensor(out_val->at(0));
@@ -107,7 +104,7 @@ void add_native_layer_norm_node(
       // Inputs and Outputs
       {{{out_val->at(0), out_val->at(1), out_val->at(2)},
         vkapi::MemoryAccessType::WRITE},
-       {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
+       {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
       {t_out->logical_limits_ubo(),
        t_out->sizes_ubo(),

diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
@@ -71,8 +71,7 @@ void add_max_pool2d_node(
     const ValueRef dilation,
     const ValueRef ceil_mode,
     const ValueRef out) {
-  ValueRef arg = prepack_if_tensor_ref(graph, in);
-  vTensorPtr t_in = graph.get_tensor(arg);
+  vTensorPtr t_in = graph.get_tensor(in);
 
   const auto out_val = graph.get_value_list(out);
   vTensorPtr t_out = graph.get_tensor(out_val->at(0));
@@ -100,7 +99,7 @@ void add_max_pool2d_node(
       local_size,
       // Inputs and Outputs
       {{{out_val->at(0), out_val->at(1)}, vkapi::MemoryAccessType::WRITE},
-       {arg, vkapi::MemoryAccessType::READ}},
+       {in, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
       {
           t_out->logical_limits_ubo(),
@@ -149,8 +148,7 @@ void add_avg_pool2d_node(
     const ValueRef count_include_pad,
     const ValueRef divisor_override,
     const ValueRef out) {
-  ValueRef arg = prepack_if_tensor_ref(graph, in);
-  vTensorPtr t_in = graph.get_tensor(arg);
+  vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
 
   check_pool2d_args(*t_in, *t_out);
@@ -174,7 +172,7 @@ void add_avg_pool2d_node(
       local_size,
       // Inputs and Outputs
       {{out, vkapi::MemoryAccessType::WRITE},
-       {arg, vkapi::MemoryAccessType::READ}},
+       {in, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
       {t_out->logical_limits_ubo(),
        t_in->sizes_ubo(),