
Commit 1b1e484

Update on "[ET-VK] Minor build graph change to improve model load time and memory."
A minor change in GraphBuilder to avoid creating a temporary vector and to reserve memory up front while building each operator. Differential Revision: [D73864959](https://our.internmc.facebook.com/intern/diff/D73864959/) [ghstack-poisoned]
Merge commit 1b1e484 (2 parents: 2eff3c8 + 88087cc)
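The GraphBuilder change the message describes is C++ code in the Vulkan (ET-VK) delegate; the sketch below is only a rough, hypothetical Python analogy of the pattern it names (size the destination container once and fill it in place, instead of filling a temporary container and copying it). It is not the commit's code.

```python
# Hypothetical illustration only: not the GraphBuilder code from this commit.
def build_args_with_temp(value_refs, resolve):
    tmp = []                      # temporary container built first...
    for ref in value_refs:
        tmp.append(resolve(ref))
    return list(tmp)              # ...then copied into the final container

def build_args_preallocated(value_refs, resolve):
    args = [None] * len(value_refs)   # "reserve" the final size once
    for i, ref in enumerate(value_refs):
        args[i] = resolve(ref)        # fill in place, no temporary or copy
    return args

# Example usage with a trivial resolver.
print(build_args_preallocated([1, 2, 3], resolve=lambda r: r * 10))  # [10, 20, 30]
```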

12 files changed: +1146 additions, -52 deletions


backends/cadence/aot/reorder_ops.py

Lines changed: 25 additions & 23 deletions
@@ -30,33 +30,35 @@
 
 # A list of ops that can be trivially quantized
 trivially_quantizable_ops_overloadpkt = {
-    torch.ops.aten.slice_copy,
-    torch.ops.aten.slice,
-    torch.ops.aten.view_copy,
-    torch.ops.aten.view,
-    torch.ops.aten.clone,
-    torch.ops.aten.transpose_copy,
-    torch.ops.aten.transpose,
-    torch.ops.aten.permute_copy,
-    torch.ops.aten.permute,
-    torch.ops.aten.squeeze_copy,
-    torch.ops.aten.squeeze,
-    torch.ops.aten.unsqueeze_copy,
-    torch.ops.aten.unsqueeze,
-    torch.ops.aten.chunk,
-    torch.ops.aten.contiguous,
-    torch.ops.aten.select_copy,
-    exir_ops.edge.aten.slice_copy,
-    exir_ops.edge.aten.view_copy,
+    exir_ops.edge.aten.chunk,
     exir_ops.edge.aten.clone,
-    exir_ops.edge.aten.transpose_copy,
+    exir_ops.edge.aten.contiguous,
+    exir_ops.edge.aten.expand_copy,
     exir_ops.edge.aten.permute_copy,
+    exir_ops.edge.aten.select_copy,
+    exir_ops.edge.aten.slice_copy,
     exir_ops.edge.aten.squeeze_copy,
-    exir_ops.edge.aten.unsqueeze_copy,
+    exir_ops.edge.aten.transpose_copy,
     exir_ops.edge.aten.unfold_copy,
-    exir_ops.edge.aten.chunk,
-    exir_ops.edge.aten.contiguous,
-    exir_ops.edge.aten.select_copy,
+    exir_ops.edge.aten.unsqueeze_copy,
+    exir_ops.edge.aten.view_copy,
+    torch.ops.aten.chunk,
+    torch.ops.aten.clone,
+    torch.ops.aten.contiguous,
+    torch.ops.aten.expand_copy,
+    torch.ops.aten.permute,
+    torch.ops.aten.permute_copy,
+    torch.ops.aten.select_copy,
+    torch.ops.aten.slice,
+    torch.ops.aten.slice_copy,
+    torch.ops.aten.squeeze,
+    torch.ops.aten.squeeze_copy,
+    torch.ops.aten.transpose,
+    torch.ops.aten.transpose_copy,
+    torch.ops.aten.unsqueeze,
+    torch.ops.aten.unsqueeze_copy,
+    torch.ops.aten.view,
+    torch.ops.aten.view_copy,
 }
 
 # slice-equivalent ops
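These entries are op overload packets, so a pass that wants to move quantize/dequantize nodes past such ops can gate on a simple membership test. A minimal, hypothetical sketch of that check (the helper name and the node-access pattern are illustrative, not code from reorder_ops.py):

```python
import torch

# Abbreviated copy of the set above; the real module lists many more entries.
trivially_quantizable_ops_overloadpkt = {
    torch.ops.aten.permute_copy,
    torch.ops.aten.view_copy,
}

def is_trivially_quantizable(node) -> bool:
    """Return True if a torch.fx node calls one of the ops in the set.

    torch.fx stores the resolved OpOverload in node.target; its
    .overloadpacket attribute is what the set above contains.
    """
    return (
        node.op == "call_function"
        and getattr(node.target, "overloadpacket", None)
        in trivially_quantizable_ops_overloadpkt
    )
```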

backends/cadence/hifi/operators/operators.h

Lines changed: 31 additions & 0 deletions
@@ -12,6 +12,11 @@
   _(uint8_t, Byte) \
   _(int8_t, Char)
 
+using ::executorch::aten::optional;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+
 namespace cadence {
 namespace impl {
 namespace HiFi {

@@ -36,6 +41,32 @@ ::executorch::aten::Tensor& div_out_mode(
     ::executorch::aten::optional<::executorch::aten::string_view> mode,
     ::executorch::aten::Tensor& out);
 
+void quantized_linear_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out);
+
+void quantized_linear_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out);
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl
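For orientation, the new declarations follow the usual affine-quantization convention: subtract zero points, accumulate in int32, add bias, then requantize with a fixed-point multiplier and shift before adding the output zero point. The NumPy sketch below restates that convention for the per-tensor variant; it is not the HiFi kernel, and the exact rounding, shift direction, and output-range details are assumptions.

```python
import numpy as np

def quantized_linear_per_tensor_ref(
    x_q, w_q, bias_q, in_zero_point, weight_zero_point,
    out_multiplier, out_shift, out_zero_point,
):
    """Plain-NumPy model of a per-tensor quantized linear layer.

    x_q: (N, K) int8 activations, w_q: (M, K) int8 weights,
    bias_q: (M,) int32 bias already in the accumulator scale.
    Rounding/saturation choices here are assumptions, not the kernel's spec.
    """
    # Integer accumulation with zero points removed.
    acc = (x_q.astype(np.int32) - in_zero_point) @ (w_q.astype(np.int32) - weight_zero_point).T
    acc = acc + bias_q.astype(np.int64)
    # Requantize: Q31 fixed-point multiplier, then a power-of-two shift
    # (the sign convention of out_shift is an assumption here).
    scaled = acc * (out_multiplier / float(1 << 31)) * (2.0 ** out_shift)
    out = np.rint(scaled) + out_zero_point
    return np.clip(out, -128, 127).astype(np.int8)
```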

backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl

Lines changed: 23 additions & 26 deletions
@@ -60,9 +60,9 @@ const lowp int out_packed_dim = unhash_packed_dim(out_layout);
 // First iteration of reduce will have 32 threads sum up 64 elements.
 // Second iteration will have 32 threads sum up 16 elements from previous iteration and so on.
 // Thus thread utilization starts at 100%.
-#define SHARED_MEMORY_FACTOR 2
+#define SHARED_MEMORY_FACTOR 1
 
-#define offset_pos_index(index) ((index) + ((index) >> 2))
+#define offset_pos_index(index) ((index) + ((index) >> 3))
 
 shared VEC4_T shared_input[offset_pos_index(MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR)];

@@ -154,14 +154,13 @@ void reduce_non_packed_dim() {
       if (all(lessThan(in_pos, out_limits))) {
         in_val = load_texel(t_in, in_pos);
       }
-      shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = in_val;
+      mean += in_val;
     }
-
-    reduce_input(width_stride, shared_idx_offset);
-    mean += shared_input[offset_pos_index(shared_idx_offset)];
   }
 
-  mean /= width;
+  shared_input[offset_pos_index(shared_idx)] = mean;
+  reduce_input(width_stride, shared_idx_offset);
+  mean = shared_input[offset_pos_index(shared_idx_offset)] / width;
 
   memoryBarrierShared();
   barrier();

@@ -178,14 +177,13 @@
       }
 
       const VEC4_T delta = in_val - mean;
-      shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = delta * delta;
+      var += delta * delta;
     }
-
-    reduce_input(width_stride, shared_idx_offset);
-    var += shared_input[offset_pos_index(shared_idx_offset)];
   }
 
-  var /= width;
+  shared_input[offset_pos_index(shared_idx)] = var;
+  reduce_input(width_stride, shared_idx_offset);
+  var = shared_input[offset_pos_index(shared_idx_offset)] / width;
 
   VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5));
   VEC4_T offset = -rstd * mean;

@@ -226,6 +224,7 @@ void reduce_packed_dim() {
 
   const int in_pos_x_limit = out_limits[in_axis_map.x];
 
+  VEC4_T accum = VEC4_T(0);
   // Loop over the width in stride increments
   for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) {
     // Read input in shared memory

@@ -244,20 +243,20 @@
         in_val.z = mix(in_val.z, T(0), remain_inv > 1);
         in_val.w = mix(in_val.w, T(0), remain_inv > 0);
       }
-
-      shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = in_val;
+      accum += in_val;
     }
-
-    reduce_input(width_stride, shared_idx_offset);
-    const VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)];
-    mean += val.x + val.y + val.z + val.w;
   }
 
-  mean /= width;
+  shared_input[offset_pos_index(shared_idx)] = accum;
+  reduce_input(width_stride, shared_idx_offset);
+  VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)];
+  mean = (val.x + val.y + val.z + val.w) / width;
 
   memoryBarrierShared();
   barrier();
 
+  VEC4_T delta2 = VEC4_T(0);
+
   // Loop over the width in stride increments
   for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) {
     // Read input in shared memory

@@ -278,16 +277,14 @@
       }
 
       const VEC4_T delta = in_val - mean;
-      const VEC4_T delta2 = delta * delta;
-      shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = delta2;
+      delta2 += delta * delta;
     }
-
-    reduce_input(width_stride, shared_idx_offset);
-    const VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)];
-    var += val.x + val.y + val.z + val.w;
   }
 
-  var /= width;
+  shared_input[offset_pos_index(shared_idx)] = delta2;
+  reduce_input(width_stride, shared_idx_offset);
+  val = shared_input[offset_pos_index(shared_idx_offset)];
+  var = (val.x + val.y + val.z + val.w) / width;
 
   T rstd = pow(var + epsilon, T(-0.5));
   T offset = -rstd * mean;
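The net effect of this change is that each invocation now accumulates a private partial sum over its strided slice of the width, writes that single value to shared memory, and runs one tree reduction, instead of staging every loaded texel in shared memory and reducing once per stride. A small NumPy sketch of the same two-pass mean/variance pattern, with "threads" modeled as strided slices; this is illustrative only, not the shader:

```python
import numpy as np

def layer_norm_stats(x: np.ndarray, num_threads: int = 32, eps: float = 1e-5):
    """Two-pass mean/variance in the style of the shader: each 'thread'
    accumulates a private partial sum over its strided slice, the partials
    are reduced once, and the same pattern repeats for the variance."""
    width = x.shape[-1]
    # Pass 1: per-thread partial sums over strided slices, then one reduction.
    partial_sums = np.array(
        [x[..., t::num_threads].sum(axis=-1) for t in range(num_threads)]
    )
    mean = partial_sums.sum(axis=0) / width
    # Pass 2: per-thread partial sums of squared deltas, then one reduction.
    partial_sq = np.array(
        [((x[..., t::num_threads] - mean[..., None]) ** 2).sum(axis=-1)
         for t in range(num_threads)]
    )
    var = partial_sq.sum(axis=0) / width
    rstd = (var + eps) ** -0.5
    return mean, rstd

# Example: matches the direct per-row mean.
x = np.random.randn(4, 128).astype(np.float32)
mean, rstd = layer_norm_stats(x)
assert np.allclose(mean, x.mean(axis=-1), atol=1e-5)
```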

backends/xnnpack/quantizer/xnnpack_quantizer.py

Lines changed: 26 additions & 3 deletions
@@ -292,6 +292,9 @@ def __init__(self) -> None:
         ] = {}
         self.module_type_config: dict[Callable, Optional[QuantizationConfig]] = {}
         self.module_name_config: dict[str, Optional[QuantizationConfig]] = {}
+        # If specified, only quantize nodes that return true for the filter
+        # function.
+        self.filter_fn: Optional[Callable[[Node], bool]] = None
 
     @classmethod
     def get_supported_quantization_configs(cls) -> list[QuantizationConfig]:

@@ -355,6 +358,14 @@ def set_module_name(
         self.module_name_config[module_name] = quantization_config
         return self
 
+    def set_filter_function(self, filter_fn: Callable[[Node], bool]):
+        """
+        Set the filter function. We only quantize nodes that return True for
+        the filter function.
+        """
+        self.filter_fn = filter_fn
+        return self
+
     def transform_for_annotation(
         self, model: torch.fx.GraphModule
     ) -> torch.fx.GraphModule:

@@ -378,17 +389,29 @@ def _annotate_all_patterns(
         if quantization_config is None:
             return model
 
+        # Create a combined filter function, which returns True only when
+        # both filter_fn and self.filter_fn return True.
+        def combined_filter_fn(n: Node) -> bool:
+            combined_filter = [self.filter_fn, filter_fn]
+            return all(f(n) for f in combined_filter if f is not None)
+
         for pattern in self.SUPPORTED_PATTERNS:
             if operator_target and operator_target not in pattern.op_overloads:
                 # if operator_target is specified, skip patterns that aren't
                 # associated with that target
                 continue
             if quantization_config.input_activation.is_dynamic and pattern.is_dynamic:
-                OP_TO_ANNOTATOR[pattern.name](model, quantization_config, filter_fn)
+                OP_TO_ANNOTATOR[pattern.name](
+                    model, quantization_config, combined_filter_fn
+                )
             elif quantization_config.is_qat and pattern.is_qat:
-                OP_TO_ANNOTATOR[pattern.name](model, quantization_config, filter_fn)
+                OP_TO_ANNOTATOR[pattern.name](
+                    model, quantization_config, combined_filter_fn
+                )
             elif not quantization_config.input_activation.is_dynamic:
-                OP_TO_ANNOTATOR[pattern.name](model, quantization_config, filter_fn)
+                OP_TO_ANNOTATOR[pattern.name](
+                    model, quantization_config, combined_filter_fn
+                )
 
         return model
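Taken together with `set_global`, the new hook means a node is annotated only when both the per-pattern filter and the user-supplied filter accept it. A minimal usage sketch mirroring the new test below (the import path is assumed from the file location; the config and filter calls are the ones in this diff):

```python
# Import path assumed from backends/xnnpack/quantizer/xnnpack_quantizer.py.
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

quantizer = XNNPACKQuantizer()
quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True))

# Quantize everything except the node named "linear_1" (the second linear).
quantizer.set_filter_function(lambda n: n.name != "linear_1")
```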

backends/xnnpack/test/quantizer/test_xnnpack_quantizer.py

Lines changed: 30 additions & 0 deletions
@@ -297,6 +297,36 @@ def test_obs_sharing_ops(self):
         ]
         self._test_quantizer(m, example_inputs, quantizer, node_occurrence, node_list)
 
+    def test_set_filter_fn(self):
+        quantizer = XNNPACKQuantizer()
+        quantization_config = get_symmetric_quantization_config(is_per_channel=True)
+        quantizer.set_global(quantization_config)
+        m_eager = TestHelperModules.TwoLinearModule().eval()
+
+        # Set the filter function so that the second linear is not quantized
+        def filter_fn(n):
+            return n.name != "linear_1"
+
+        quantizer.set_filter_function(filter_fn)
+
+        # Test with 2d inputs
+        example_inputs_2d = (torch.randn(9, 8),)
+        node_occurrence = {
+            # input and output of the first linear op will be (de)quantized
+            torch.ops.quantized_decomposed.quantize_per_tensor.default: 2,
+            torch.ops.quantized_decomposed.dequantize_per_tensor.default: 2,
+            # quantize_per_channel for weights are const propagated
+            torch.ops.quantized_decomposed.quantize_per_channel.default: 0,
+            # weight for the first linear will be dequantized
+            torch.ops.quantized_decomposed.dequantize_per_channel.default: 1,
+        }
+        self._test_quantizer(
+            m_eager,
+            example_inputs_2d,
+            quantizer,
+            node_occurrence,
+        )
+
     def test_set_module_name(self):
         class Sub(torch.nn.Module):
             def __init__(self) -> None:

examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java

Lines changed: 1 addition & 0 deletions
@@ -14,4 +14,5 @@ public enum ModelType {
   LLAMA_3_2,
   LLAVA_1_5,
   LLAMA_GUARD_3,
+  QWEN_3,
 }

examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ public static int getModelCategory(ModelType modelType, BackendType backendType)
       case LLAMA_3:
       case LLAMA_3_1:
       case LLAMA_3_2:
+      case QWEN_3:
       default:
         return TEXT_MODEL;
     }

examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java

Lines changed: 12 additions & 0 deletions
@@ -25,6 +25,8 @@ public static String getSystemPromptTemplate(ModelType modelType) {
             + "<|eot_id|>";
       case LLAVA_1_5:
         return "USER: ";
+      case QWEN_3:
+        return "<|im_start|>system\n" + "You are a helpful assistant.\n" + "<|im_end|>\n";
       default:
         return SYSTEM_PLACEHOLDER;
     }

@@ -42,6 +44,14 @@ public static String getUserPromptTemplate(ModelType modelType) {
             + "<|start_header_id|>assistant<|end_header_id|>";
 
       case LLAVA_1_5:
+      case QWEN_3:
+        return "<|im_start|>user\n"
+            + USER_PLACEHOLDER
+            + "<|im_end|>\n"
+            + "<|im_start|>assistant\n"
+            + "<think>\n"
+            + "\n"
+            + "</think>\n\n\n";
       default:
         return USER_PLACEHOLDER;
     }

@@ -69,6 +79,8 @@ public static String getStopToken(ModelType modelType) {
         return "<|eot_id|>";
       case LLAVA_1_5:
         return "</s>";
+      case QWEN_3:
+        return "<|endoftext|>";
       default:
         return "";
     }
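Concretely, for a single user turn the two Qwen 3 templates above compose into the prompt shown in the small sketch below. The string contents are copied from the diff; the system-then-user ordering is an assumption about how the app stitches templates together, and the template pre-fills an empty `<think>` block for the assistant turn.

```python
# Illustrative composition of the Qwen 3 templates added above.
system_prompt = "<|im_start|>system\n" + "You are a helpful assistant.\n" + "<|im_end|>\n"
user_prompt = (
    "<|im_start|>user\n"
    + "Tell me about ExecuTorch."   # stands in for USER_PLACEHOLDER
    + "<|im_end|>\n"
    + "<|im_start|>assistant\n"
    + "<think>\n"
    + "\n"
    + "</think>\n\n\n"
)
print(system_prompt + user_prompt)
```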

examples/models/llama/README.md

Lines changed: 2 additions & 0 deletions
@@ -308,6 +308,8 @@ Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the
 
 To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON`
 
+If you see an error about "RE2 failed to compile pattern with lookahead:...SUPPORT_REGEX_LOOKAHEAD=ON", add "-DSUPPORT_REGEX_LOOKAHEAD=ON" when building the runner.
+
 ## Step 4: Run benchmark on Android phone
 
 **1. Build llama runner binary for Android**
