
Commit 9d32edf

Update on "[ET-VK] Using vector for storing ref_mapping_ in GraphBuilder to improve model load time and memory."
This diff changes the GraphBuilder class to store the ref-id-to-value mapping as a vector instead of an unordered map; since the maximum id is known, the vector can be pre-sized to hold the entire mapping.

Differential Revision: [D73969916](https://our.internmc.facebook.com/intern/diff/D73969916/)

[ghstack-poisoned]
2 parents 7069b15 + af220d6 commit 9d32edf
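
The idea behind the change, as a minimal Python sketch (illustration only — the actual change is in the C++ GraphBuilder, where `ref_mapping_` becomes a pre-sized `std::vector` instead of an `std::unordered_map`; the class and method names below are hypothetical):

```python
# Hypothetical sketch: a dense, pre-sized mapping keyed by small integer ids.
from typing import Generic, Optional, TypeVar

V = TypeVar("V")


class RefMapping(Generic[V]):
    """Maps ref ids to values using a list sized to the known maximum id."""

    def __init__(self, max_ref_id: int) -> None:
        # One slot per possible id: O(1) lookups with no hashing or rehashing.
        self._values: list[Optional[V]] = [None] * (max_ref_id + 1)

    def insert(self, ref_id: int, value: V) -> None:
        self._values[ref_id] = value

    def get(self, ref_id: int) -> Optional[V]:
        return self._values[ref_id]


# Usage: size the mapping once from the known maximum id, then index directly.
mapping: RefMapping[str] = RefMapping(max_ref_id=7)
mapping.insert(3, "value_ref_3")
assert mapping.get(3) == "value_ref_3"
```

Because the ids are dense and bounded, lookup becomes a direct index into pre-allocated storage, which avoids hashing and rehashing work during model load.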

File tree

12 files changed, +1146 −52 lines


backends/cadence/aot/reorder_ops.py

Lines changed: 25 additions & 23 deletions
@@ -30,33 +30,35 @@

 # A list of ops that can be trivially quantized
 trivially_quantizable_ops_overloadpkt = {
-    torch.ops.aten.slice_copy,
-    torch.ops.aten.slice,
-    torch.ops.aten.view_copy,
-    torch.ops.aten.view,
-    torch.ops.aten.clone,
-    torch.ops.aten.transpose_copy,
-    torch.ops.aten.transpose,
-    torch.ops.aten.permute_copy,
-    torch.ops.aten.permute,
-    torch.ops.aten.squeeze_copy,
-    torch.ops.aten.squeeze,
-    torch.ops.aten.unsqueeze_copy,
-    torch.ops.aten.unsqueeze,
-    torch.ops.aten.chunk,
-    torch.ops.aten.contiguous,
-    torch.ops.aten.select_copy,
-    exir_ops.edge.aten.slice_copy,
-    exir_ops.edge.aten.view_copy,
+    exir_ops.edge.aten.chunk,
     exir_ops.edge.aten.clone,
-    exir_ops.edge.aten.transpose_copy,
+    exir_ops.edge.aten.contiguous,
+    exir_ops.edge.aten.expand_copy,
     exir_ops.edge.aten.permute_copy,
+    exir_ops.edge.aten.select_copy,
+    exir_ops.edge.aten.slice_copy,
     exir_ops.edge.aten.squeeze_copy,
-    exir_ops.edge.aten.unsqueeze_copy,
+    exir_ops.edge.aten.transpose_copy,
     exir_ops.edge.aten.unfold_copy,
-    exir_ops.edge.aten.chunk,
-    exir_ops.edge.aten.contiguous,
-    exir_ops.edge.aten.select_copy,
+    exir_ops.edge.aten.unsqueeze_copy,
+    exir_ops.edge.aten.view_copy,
+    torch.ops.aten.chunk,
+    torch.ops.aten.clone,
+    torch.ops.aten.contiguous,
+    torch.ops.aten.expand_copy,
+    torch.ops.aten.permute,
+    torch.ops.aten.permute_copy,
+    torch.ops.aten.select_copy,
+    torch.ops.aten.slice,
+    torch.ops.aten.slice_copy,
+    torch.ops.aten.squeeze,
+    torch.ops.aten.squeeze_copy,
+    torch.ops.aten.transpose,
+    torch.ops.aten.transpose_copy,
+    torch.ops.aten.unsqueeze,
+    torch.ops.aten.unsqueeze_copy,
+    torch.ops.aten.view,
+    torch.ops.aten.view_copy,
 }

 # slice-equivalent ops

backends/cadence/hifi/operators/operators.h

Lines changed: 31 additions & 0 deletions
@@ -12,6 +12,11 @@
   _(uint8_t, Byte) \
   _(int8_t, Char)

+using ::executorch::aten::optional;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+
 namespace cadence {
 namespace impl {
 namespace HiFi {
@@ -36,6 +41,32 @@ ::executorch::aten::Tensor& div_out_mode(
     ::executorch::aten::optional<::executorch::aten::string_view> mode,
     ::executorch::aten::Tensor& out);

+void quantized_linear_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out);
+
+void quantized_linear_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out);
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl

Lines changed: 23 additions & 26 deletions
@@ -60,9 +60,9 @@ const lowp int out_packed_dim = unhash_packed_dim(out_layout);
 // First iteration of reduce will have 32 threads sum up 64 elements.
 // Second iteration will have 32 threads sum up 16 elements from previous iteration and so on.
 // Thus thread utilization starts at 100%.
-#define SHARED_MEMORY_FACTOR 2
+#define SHARED_MEMORY_FACTOR 1

-#define offset_pos_index(index) ((index) + ((index) >> 2))
+#define offset_pos_index(index) ((index) + ((index) >> 3))

 shared VEC4_T shared_input[offset_pos_index(MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR)];

@@ -154,14 +154,13 @@ void reduce_non_packed_dim() {
       if (all(lessThan(in_pos, out_limits))) {
         in_val = load_texel(t_in, in_pos);
       }
-      shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = in_val;
+      mean += in_val;
     }
-
-    reduce_input(width_stride, shared_idx_offset);
-    mean += shared_input[offset_pos_index(shared_idx_offset)];
   }

-  mean /= width;
+  shared_input[offset_pos_index(shared_idx)] = mean;
+  reduce_input(width_stride, shared_idx_offset);
+  mean = shared_input[offset_pos_index(shared_idx_offset)] / width;

   memoryBarrierShared();
   barrier();
@@ -178,14 +177,13 @@ void reduce_non_packed_dim() {
       }

       const VEC4_T delta = in_val - mean;
-      shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = delta * delta;
+      var += delta * delta;
     }
-
-    reduce_input(width_stride, shared_idx_offset);
-    var += shared_input[offset_pos_index(shared_idx_offset)];
   }

-  var /= width;
+  shared_input[offset_pos_index(shared_idx)] = var;
+  reduce_input(width_stride, shared_idx_offset);
+  var = shared_input[offset_pos_index(shared_idx_offset)] / width;

   VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5));
   VEC4_T offset = -rstd * mean;
@@ -226,6 +224,7 @@ void reduce_packed_dim() {

   const int in_pos_x_limit = out_limits[in_axis_map.x];

+  VEC4_T accum = VEC4_T(0);
   // Loop over the width in stride increments
   for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) {
     // Read input in shared memory
@@ -244,20 +243,20 @@ void reduce_packed_dim() {
         in_val.z = mix(in_val.z, T(0), remain_inv > 1);
         in_val.w = mix(in_val.w, T(0), remain_inv > 0);
       }
-
-      shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = in_val;
+      accum += in_val;
     }
-
-    reduce_input(width_stride, shared_idx_offset);
-    const VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)];
-    mean += val.x + val.y + val.z + val.w;
   }

-  mean /= width;
+  shared_input[offset_pos_index(shared_idx)] = accum;
+  reduce_input(width_stride, shared_idx_offset);
+  VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)];
+  mean = (val.x + val.y + val.z + val.w) / width;

   memoryBarrierShared();
   barrier();

+  VEC4_T delta2 = VEC4_T(0);
+
   // Loop over the width in stride increments
   for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) {
     // Read input in shared memory
@@ -278,16 +277,14 @@ void reduce_packed_dim() {
       }

       const VEC4_T delta = in_val - mean;
-      const VEC4_T delta2 = delta * delta;
-      shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = delta2;
+      delta2 += delta * delta;
     }
-
-    reduce_input(width_stride, shared_idx_offset);
-    const VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)];
-    var += val.x + val.y + val.z + val.w;
   }

-  var /= width;
+  shared_input[offset_pos_index(shared_idx)] = delta2;
+  reduce_input(width_stride, shared_idx_offset);
+  val = shared_input[offset_pos_index(shared_idx_offset)];
+  var = (val.x + val.y + val.z + val.w) / width;

   T rstd = pow(var + epsilon, T(-0.5));
   T offset = -rstd * mean;
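
The net effect of the shader change above is that each invocation now accumulates its partial sum in a register and only the per-thread partials go through the shared-memory tree reduction, so shared memory is written once per thread instead of once per element. A small Python sketch of that two-phase pattern (illustrative only, not GLSL; the workgroup size and input are assumed):

```python
# Illustrative sketch of the two-phase reduction: each "thread" accumulates its
# strided slice privately, writes one partial sum to shared memory, then a tree
# reduction combines the partials.
import numpy as np

WORKGROUP_SIZE = 32  # assumed workgroup size, mirroring the 32-thread comment above


def workgroup_sum(values: np.ndarray) -> float:
    # Phase 1: per-thread private accumulation over a strided range
    # (corresponds to `mean += in_val` / `accum += in_val` above).
    shared = np.zeros(WORKGROUP_SIZE)
    for tid in range(WORKGROUP_SIZE):
        shared[tid] = values[tid::WORKGROUP_SIZE].sum()

    # Phase 2: shared-memory tree reduction, halving the active threads each
    # step (corresponds to `reduce_input(...)` above).
    stride = WORKGROUP_SIZE // 2
    while stride > 0:
        for tid in range(stride):
            shared[tid] += shared[tid + stride]
        stride //= 2
    return float(shared[0])


values = np.arange(256, dtype=np.float64)
assert workgroup_sum(values) == values.sum()
```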

backends/xnnpack/quantizer/xnnpack_quantizer.py

Lines changed: 26 additions & 3 deletions
@@ -292,6 +292,9 @@ def __init__(self) -> None:
         ] = {}
         self.module_type_config: dict[Callable, Optional[QuantizationConfig]] = {}
         self.module_name_config: dict[str, Optional[QuantizationConfig]] = {}
+        # If specified, only quantize nodes that return true for the filter
+        # function.
+        self.filter_fn: Optional[Callable[[Node], bool]] = None

     @classmethod
     def get_supported_quantization_configs(cls) -> list[QuantizationConfig]:
@@ -355,6 +358,14 @@ def set_module_name(
         self.module_name_config[module_name] = quantization_config
         return self

+    def set_filter_function(self, filter_fn: Callable[[Node], bool]):
+        """
+        Set the filter function. We only quantize nodes that return True for
+        the filter function.
+        """
+        self.filter_fn = filter_fn
+        return self
+
     def transform_for_annotation(
         self, model: torch.fx.GraphModule
     ) -> torch.fx.GraphModule:
@@ -378,17 +389,29 @@ def _annotate_all_patterns(
         if quantization_config is None:
             return model

+        # Create a combined filter function, which returns True only when
+        # both filter_fn and self.filter_fn return True.
+        def combined_filter_fn(n: Node) -> bool:
+            combined_filter = [self.filter_fn, filter_fn]
+            return all(f(n) for f in combined_filter if f is not None)
+
         for pattern in self.SUPPORTED_PATTERNS:
             if operator_target and operator_target not in pattern.op_overloads:
                 # if operator_target is specified, skip patterns that aren't
                 # associated with that target
                 continue
             if quantization_config.input_activation.is_dynamic and pattern.is_dynamic:
-                OP_TO_ANNOTATOR[pattern.name](model, quantization_config, filter_fn)
+                OP_TO_ANNOTATOR[pattern.name](
+                    model, quantization_config, combined_filter_fn
+                )
             elif quantization_config.is_qat and pattern.is_qat:
-                OP_TO_ANNOTATOR[pattern.name](model, quantization_config, filter_fn)
+                OP_TO_ANNOTATOR[pattern.name](
+                    model, quantization_config, combined_filter_fn
+                )
             elif not quantization_config.input_activation.is_dynamic:
-                OP_TO_ANNOTATOR[pattern.name](model, quantization_config, filter_fn)
+                OP_TO_ANNOTATOR[pattern.name](
+                    model, quantization_config, combined_filter_fn
+                )

         return model
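
A short usage sketch of the new hook, mirroring the test added in this commit (import paths assumed from this repository's layout; the filtered node name "linear_1" is illustrative):

```python
# Only nodes for which the filter predicate returns True are annotated for
# quantization; here the second linear ("linear_1") stays in floating point.
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

quantizer = XNNPACKQuantizer()
quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True))
quantizer.set_filter_function(lambda n: n.name != "linear_1")
```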

backends/xnnpack/test/quantizer/test_xnnpack_quantizer.py

Lines changed: 30 additions & 0 deletions
@@ -297,6 +297,36 @@ def test_obs_sharing_ops(self):
         ]
         self._test_quantizer(m, example_inputs, quantizer, node_occurrence, node_list)

+    def test_set_filter_fn(self):
+        quantizer = XNNPACKQuantizer()
+        quantization_config = get_symmetric_quantization_config(is_per_channel=True)
+        quantizer.set_global(quantization_config)
+        m_eager = TestHelperModules.TwoLinearModule().eval()
+
+        # Set the filter function so that the second linear is not quantized
+        def filter_fn(n):
+            return n.name != "linear_1"
+
+        quantizer.set_filter_function(filter_fn)
+
+        # Test with 2d inputs
+        example_inputs_2d = (torch.randn(9, 8),)
+        node_occurrence = {
+            # input and output of the first linear op will be (de)quantized
+            torch.ops.quantized_decomposed.quantize_per_tensor.default: 2,
+            torch.ops.quantized_decomposed.dequantize_per_tensor.default: 2,
+            # quantize_per_channel for weights are const propagated
+            torch.ops.quantized_decomposed.quantize_per_channel.default: 0,
+            # weight for the first linear will be dequantized
+            torch.ops.quantized_decomposed.dequantize_per_channel.default: 1,
+        }
+        self._test_quantizer(
+            m_eager,
+            example_inputs_2d,
+            quantizer,
+            node_occurrence,
+        )
+
     def test_set_module_name(self):
         class Sub(torch.nn.Module):
             def __init__(self) -> None:

examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java

Lines changed: 1 addition & 0 deletions
@@ -14,4 +14,5 @@ public enum ModelType {
   LLAMA_3_2,
   LLAVA_1_5,
   LLAMA_GUARD_3,
+  QWEN_3,
 }

examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ public static int getModelCategory(ModelType modelType, BackendType backendType)
       case LLAMA_3:
       case LLAMA_3_1:
       case LLAMA_3_2:
+      case QWEN_3:
       default:
         return TEXT_MODEL;
     }

examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java

Lines changed: 12 additions & 0 deletions
@@ -25,6 +25,8 @@ public static String getSystemPromptTemplate(ModelType modelType) {
             + "<|eot_id|>";
       case LLAVA_1_5:
         return "USER: ";
+      case QWEN_3:
+        return "<|im_start|>system\n" + "You are a helpful assistant.\n" + "<|im_end|>\n";
       default:
         return SYSTEM_PLACEHOLDER;
     }
@@ -42,6 +44,14 @@ public static String getUserPromptTemplate(ModelType modelType) {
             + "<|start_header_id|>assistant<|end_header_id|>";

       case LLAVA_1_5:
+      case QWEN_3:
+        return "<|im_start|>user\n"
+            + USER_PLACEHOLDER
+            + "<|im_end|>\n"
+            + "<|im_start|>assistant\n"
+            + "<think>\n"
+            + "\n"
+            + "</think>\n\n\n";
       default:
         return USER_PLACEHOLDER;
     }
@@ -69,6 +79,8 @@ public static String getStopToken(ModelType modelType) {
         return "<|eot_id|>";
       case LLAVA_1_5:
         return "</s>";
+      case QWEN_3:
+        return "<|endoftext|>";
       default:
         return "";
     }
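
For reference, the Qwen 3 templates added above assemble into a prompt roughly like the following (a Python sketch; the app builds these strings in Java, and the user message stands in for `USER_PLACEHOLDER`):

```python
# Rough rendering of the Qwen 3 prompt built from the templates above
# ("What is ExecuTorch?" stands in for USER_PLACEHOLDER).
system_prompt = "<|im_start|>system\n" + "You are a helpful assistant.\n" + "<|im_end|>\n"
user_prompt = (
    "<|im_start|>user\n"
    + "What is ExecuTorch?"
    + "<|im_end|>\n"
    + "<|im_start|>assistant\n"
    + "<think>\n"
    + "\n"
    + "</think>\n\n\n"
)
print(system_prompt + user_prompt)  # generation stops at the "<|endoftext|>" stop token
```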

examples/models/llama/README.md

Lines changed: 2 additions & 0 deletions
@@ -308,6 +308,8 @@ Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the

 To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON`

+If you get an error about "RE2 failed to compile pattern with lookahead:...SUPPORT_REGEX_LOOKAHEAD=ON", add "-DSUPPORT_REGEX_LOOKAHEAD=ON" when building the runner.
+
 ## Step 4: Run benchmark on Android phone

 **1. Build llama runner binary for Android**

0 commit comments
