
Commit b657769

Merge branch 'main' into gh/trivedivivek/108/orig

2 parents: 4ec327e + 088815e
File tree: 9 files changed, +155 −72 lines

backends/xnnpack/operators/node_visitor.py

Lines changed: 32 additions & 5 deletions
@@ -274,19 +274,46 @@ def get_per_channel_dtype(
         return dtype

-    def get_quant_params(self, quant_params: QuantParams) -> XNNQuantParams:
+    def get_quant_params(
+        self, quant_params: QuantParams, xnn_graph: XNNGraph
+    ) -> XNNQuantParams:
         if quant_params.per_channel:
             scale = cast(torch.Tensor, quant_params.scale)
+            buffer_idx = len(xnn_graph.constant_data)
+            num_scales = scale.numel()
+
+            if quant_params.is_per_channel_group:
+                scale = scale.to(torch.bfloat16)
+
+            num_bytes = scale.untyped_storage().nbytes()
+            scale_array = ctypes.cast(
+                scale.untyped_storage().data_ptr(),
+                ctypes.POINTER(ctypes.c_char * num_bytes),
+            ).contents
+            scale_name = hashlib.sha256(bytes(scale_array)).hexdigest()
+            xnn_graph.constant_data.append(
+                ConstantDataOffset(
+                    offset=UINT64_MAX, size=num_bytes, named_key=scale_name
+                )
+            )
+            self._named_data_store.add_named_data(
+                scale_name, bytes(scale_array), CONSTANT_TENSOR_ALIGNMENT
+            )
+
             if quant_params.is_per_channel_group:
                 return PerChannelGroupQuant(
-                    scale=scale.flatten().tolist(),
+                    scale=[],
                     channel_dim=quant_params.axis,
                     group_size=quant_params.group_size,
+                    scale_buffer_idx=buffer_idx,
+                    num_scales=num_scales,
                 )
-            else:  # per_channel quant
+            else:
                 return PerChannelQuant(
-                    scale=scale.tolist(),
+                    scale=[],
                     channel_dim=quant_params.axis,
+                    scale_buffer_idx=buffer_idx,
+                    num_scales=num_scales,
                 )
         elif quant_params.is_dynamic:
             # NB:

@@ -449,7 +476,7 @@ def define_tensor(  # noqa: C901
             else XValue(
                 xvalue_union=XNNQuantizedTensorValue(
                     tensor_value=tvalue,
-                    quant_params=self.get_quant_params(quant_params),
+                    quant_params=self.get_quant_params(quant_params, xnn_graph),
                 )
             )
         )
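
The exporter-side change above keys each scale blob by the SHA-256 of its raw bytes, so identical scale tensors collapse to a single named-data entry. Below is a minimal standalone sketch of that content-addressed keying; the helper name is ours, and the ConstantDataOffset bookkeeping is omitted:

import ctypes
import hashlib

import torch


def scale_to_named_bytes(scale: torch.Tensor) -> tuple[str, bytes]:
    # Reinterpret the tensor's storage as raw bytes; this also works for
    # bfloat16, which has no numpy dtype to round-trip through.
    num_bytes = scale.untyped_storage().nbytes()
    raw = ctypes.cast(
        scale.untyped_storage().data_ptr(),
        ctypes.POINTER(ctypes.c_char * num_bytes),
    ).contents
    data = bytes(raw)
    # Content-addressed key: equal tensors hash to the same named-data blob.
    return hashlib.sha256(data).hexdigest(), data


key, blob = scale_to_named_bytes(torch.randn(8).to(torch.bfloat16))
print(key[:16], len(blob))  # 8 bf16 scales -> 16 bytes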

backends/xnnpack/runtime/XNNCompiler.cpp

Lines changed: 39 additions & 4 deletions
@@ -421,11 +421,32 @@ Error defineTensor(
           qparams->channel_dim(),
           dtype,
           zero_point);
+
+      const float* scale = qparams->scale()->data();
+
+      if (qparams->scale_buffer_idx() != 0) {
+        // If the scales are stored in named data, retrieve them.
+        ConstantDataOffsetPtr scale_buffer_offset =
+            flatbuffer_graph->constant_data()->Get(
+                qparams->scale_buffer_idx());
+        const std::string& data_name =
+            scale_buffer_offset->named_key()->str();
+        Result<FreeableBuffer> scale_buffer =
+            named_data_map->get_data(data_name.c_str());
+        ET_CHECK_OR_RETURN_ERROR(
+            scale_buffer.ok(),
+            Internal,
+            "Failed to get constant data for key %s from named_data_map. Error code: %u",
+            data_name.c_str(),
+            static_cast<uint32_t>(scale_buffer.error()));
+        scale = reinterpret_cast<const float*>(scale_buffer.get().data());
+        freeable_buffers.push_back(std::move(scale_buffer.get()));
+      }
       status = xnn_define_channelwise_quantized_tensor_value_v2(
           /*subgraph=*/subgraph_ptr,
           /*datatype=*/dtype,
           /*zero_point=*/zero_point,
-          /*scale=*/qparams->scale()->data(),
+          /*scale=*/scale,
           /*num_dims=*/tensor_value->num_dims(),
           /*channel_dim*/ qparams->channel_dim(),
           /*dims=*/dims_data.data(),

@@ -452,10 +473,24 @@ Error defineTensor(

       // Block scales are preferably serialized as bf16 but can also be
       // serialized as fp32 for backwards compatibility.
-      if (qparams->scale_bf16() != nullptr) {
+      if (qparams->scale_buffer_idx() != 0) {
+        ConstantDataOffsetPtr scale_buffer_offset =
+            flatbuffer_graph->constant_data()->Get(
+                qparams->scale_buffer_idx());
+        const std::string& data_name =
+            scale_buffer_offset->named_key()->str();
+        Result<FreeableBuffer> scale_buffer =
+            named_data_map->get_data(data_name.c_str());
+        ET_CHECK_OR_RETURN_ERROR(
+            scale_buffer.ok(),
+            Internal,
+            "Failed to get constant data for key %s from named_data_map. Error code: %u",
+            data_name.c_str(),
+            static_cast<uint32_t>(scale_buffer.error()));
         scale_data =
-            static_cast<const uint16_t*>(qparams->scale_bf16()->data());
-        scale_numel = qparams->scale_bf16()->size();
+            reinterpret_cast<const uint16_t*>(scale_buffer.get().data());
+        freeable_buffers.push_back(std::move(scale_buffer.get()));
+        scale_numel = qparams->num_scales();
       } else {
         // Read fp32 scales, convert to bf16.
         auto conv_buffer = static_cast<uint16_t*>(allocator.allocateTemporary(
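
Note that the runtime reinterprets the named-data bytes in place (a reinterpret_cast to const uint16_t*) rather than converting them, so the blob must already contain bf16 words; and because the buffer only knows its byte size, the element count now travels separately as num_scales. A sketch of that round-trip in Python, assuming torch.frombuffer supports bfloat16 on the PyTorch version in use:

import ctypes

import torch


def tensor_bytes(t: torch.Tensor) -> bytes:
    # Raw storage bytes, as the exporter serializes them.
    n = t.untyped_storage().nbytes()
    return bytes(
        ctypes.cast(
            t.untyped_storage().data_ptr(), ctypes.POINTER(ctypes.c_char * n)
        ).contents
    )


scales = torch.tensor([0.5, 0.25, 2.0], dtype=torch.bfloat16)
blob = tensor_bytes(scales)  # 2 bytes per bf16 element
# Reinterpret the bytes, mirroring the runtime's cast over the FreeableBuffer.
restored = torch.frombuffer(bytearray(blob), dtype=torch.bfloat16)
assert torch.equal(restored, scales)  # lossless round-trip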

backends/xnnpack/serialization/runtime_schema.fbs

Lines changed: 5 additions & 1 deletion
@@ -48,6 +48,8 @@ table Buffer {
 table PerChannelQuant {
   scale:[float];
   channel_dim:int;
+  scale_buffer_idx: uint;
+  num_scales: uint;
 }

 table PerTokenDynamicQuant {

@@ -63,7 +65,9 @@ table PerChannelGroupQuant {
   scale:[float];
   channel_dim:int;
   group_size:int;
-  scale_bf16:[ushort];
+  scale_bf16:[ushort] (deprecated);
+  scale_buffer_idx: uint;
+  num_scales: uint;
 }

 table XNNTensorValue {

backends/xnnpack/serialization/schema.fbs

Lines changed: 5 additions & 1 deletion
@@ -48,12 +48,16 @@ table PerChannelGroupQuant {
   scale:[float];
   channel_dim:int;
   group_size:int;
-  scale_bf16:[ushort];
+  scale_bf16:[ushort] (deprecated);
+  scale_buffer_idx: uint;
+  num_scales: uint;
 }

 table PerChannelQuant {
   scale:[float];
   channel_dim:int;
+  scale_buffer_idx: uint;
+  num_scales: uint;
 }

 table PerTokenDynamicQuant {

backends/xnnpack/serialization/xnnpack_graph_schema.py

Lines changed: 10 additions & 0 deletions
@@ -425,13 +425,23 @@ class XNNDatatype(IntEnum):
 class PerChannelQuant:
     scale: List[float]
     channel_dim: int
+    scale_buffer_idx: int = -1
+    num_scales: int = -1
+
+
+@dataclass
+class Buffer:
+    storage: bytes


 @dataclass
 class PerChannelGroupQuant:
     scale: List[float]
     channel_dim: int
     group_size: int = 1
+    scale_bf16: Optional[List[float]] = None
+    scale_buffer_idx: int = -1
+    num_scales: int = -1


 @dataclass
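
For illustration, here is how the new dataclass fields end up populated when scales move to named data; the numeric values are hypothetical, and the import path assumes the executorch package layout matches the file paths above:

from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
    PerChannelGroupQuant,
)

pcg = PerChannelGroupQuant(
    scale=[],            # inline scales are left empty
    channel_dim=0,
    group_size=32,
    scale_buffer_idx=3,  # index into xnn_graph.constant_data
    num_scales=128,      # element count; the byte size alone is ambiguous
)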

devtools/inspector/_intermediate_output_capturer.py

Lines changed: 36 additions & 3 deletions
@@ -7,24 +7,57 @@
 # pyre-unsafe


-from typing import Any, Dict, Tuple
+from typing import Any, Dict, List, Tuple

 import torch
 from torch.fx import GraphModule
 from torch.fx.interpreter import Interpreter


+class NodeFilter:
+    """
+    A class used to filter nodes based on extensible criteria.
+    Attributes:
+        metadata_key (str): The key to look for in the node's metadata.
+        op_type (str): The operation code to match.
+        exclude_ops (List[str]): A list of operations to exclude from the filter.
+    """
+
+    def __init__(self, metadata_key: str, op_type: str, exclude_ops: List[str] = None):
+        self.metadata_key = metadata_key
+        self.op_type = op_type
+        self.exclude_ops = exclude_ops
+
+    def matches(self, node: torch.fx.Node) -> bool:
+        return (
+            node.meta.get(self.metadata_key) is not None
+            and node.op == self.op_type
+            and all(exclude_name not in node.name for exclude_name in self.exclude_ops)
+        )
+
+
 class IntermediateOutputCapturer(Interpreter):
+    """
+    A class that captures intermediate outputs from a PyTorch graph module.
+    Attributes:
+        module (GraphModule): The graph module to capture outputs from.
+        node_filters (List[NodeFilter]): A list of filters to apply to the nodes.
+    """
+
     def __init__(self, module: GraphModule):
         super().__init__(module)
+        self.node_filters = [
+            NodeFilter("debug_handle", "call_function", exclude_ops=["getitem"])
+        ]

+    # Runs the graph module and captures the intermediate outputs.
     def run_and_capture(self, *args, **kwargs) -> Dict[Tuple[int, ...], Any]:
         captured_outputs = {}

         def capture_run_node(n: torch.fx.Node) -> Any:
             result = super(IntermediateOutputCapturer, self).run_node(n)
-            debug_handle = n.meta.get("debug_handle", None)
-            if debug_handle is not None and n.op == "call_function":
+            if all(filter.matches(n) for filter in self.node_filters):
+                debug_handle = n.meta["debug_handle"]
                 # Convert the debug handle to a tuple to use as a dictionary key
                 key = (
                     (debug_handle,)
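
A quick, self-contained check of the NodeFilter semantics above on a toy torch.fx graph; the traced function and the fake debug_handle metadata are ours, for illustration only:

import torch
from executorch.devtools.inspector._intermediate_output_capturer import NodeFilter
from torch.fx import symbolic_trace


def f(x):
    return torch.relu(x + 1)


gm = symbolic_trace(f)
filt = NodeFilter("debug_handle", "call_function", exclude_ops=["getitem"])
for node in gm.graph.nodes:
    node.meta["debug_handle"] = 1  # pretend every node carries a handle
    # True only for call_function nodes whose name does not contain "getitem"
    print(node.name, filt.matches(node))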

devtools/inspector/tests/intermediate_output_capturer_test.py

Lines changed: 0 additions & 2 deletions
@@ -111,8 +111,6 @@ def test_capture_correct_outputs(self):
             (19,): torch.tensor([[3.6000, 4.5067]]),
             (20,): torch.tensor([[0.9734, 0.9891]]),
             (21,): [torch.tensor([[0.9734]]), torch.tensor([[0.9891]])],
-            (22,): torch.tensor([[0.9734]]),
-            (23,): torch.tensor([[0.9891]]),
         }
         self.assertEqual(
             len(self.intermediate_outputs), len(expected_outputs_with_handles)

examples/models/llama/source_transformation/quantize.py

Lines changed: 2 additions & 56 deletions
@@ -8,16 +8,14 @@
 import re
 from functools import partial
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Dict, Optional

 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 from executorch.extension.llm.export.builder import DType

-from sentencepiece import SentencePieceProcessor
-

 try:
     from fairseq2.nn.embedding import (

@@ -57,7 +55,7 @@ def quantize(  # noqa C901

     Args:
         model: The model to quantize.
-        qmode: The quantization mode, e.g. int8, 8da4w, 8da4w-gptq.
+        qmode: The quantization mode, e.g. int8, 8da4w.
         computation_dtype: The dtype that ops are performed in (the resulting dtype of dequantization).
             Also the dtype of the rest of the non-quantized components of the model.
         checkpoint_dtype: The dtype of the checkpoint, this arg exists since it is more accurate to

@@ -161,58 +159,6 @@ def quantize(  # noqa C901
         if verbose:
             print("quantized model:", model)
         return model
-    elif qmode == "8da4w-gptq":
-        # Check for required args
-        required_args: Optional[Any] = [
-            group_size,
-            calibration_limit,
-            calibration_seq_length,
-        ]
-        if any(arg is None for arg in required_args):
-            raise Exception(
-                "For 8da4w-gptq quantization, group size, calibration limit and calibration sequence length must be specified."
-            )
-        if calibration_tasks is None:
-            calibration_tasks = ["wikitext"]
-
-        try:
-            # torchao 0.3+
-            from torchao._models._eval import InputRecorder
-        except ImportError:
-            from torchao.quantization.GPTQ import InputRecorder  # pyre-ignore
-
-        from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer
-
-        if tokenizer_path is None:
-            assert checkpoint_path is not None, "checkpoint_path must be specified"
-            tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-        assert tokenizer_path.is_file(), tokenizer_path
-        tokenizer = SentencePieceProcessor(  # pyre-ignore[28]
-            model_file=str(tokenizer_path)
-        )
-
-        inputs = (
-            InputRecorder(  # pyre-fixme[16]
-                tokenizer,
-                calibration_seq_length,
-                None,  # input_prep_func
-                pad_calibration_inputs,
-                model.vocab_size,
-            )
-            .record_inputs(
-                calibration_tasks,
-                calibration_limit,
-            )
-            .get_inputs()
-        )
-
-        gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer(
-            blocksize,
-            percdamp,
-            group_size,
-        )  # TODO: separate computation and checkpoint dtype for GPTQ.
-        model = gptq_quantizer.quantize(model, inputs)
-        return model
     elif qmode == "vulkan_4w":
         from executorch.backends.vulkan._passes import VkInt4WeightOnlyQuantizer

extension/threadpool/cpuinfo_utils.cpp

Lines changed: 26 additions & 0 deletions
@@ -16,6 +16,10 @@

 #include <executorch/runtime/platform/assert.h>

+#if defined(__APPLE__) && defined(__aarch64__)
+#include <sys/sysctl.h>
+#endif
+
 namespace executorch::extension::cpuinfo {

 // Ignore revisions (last digit (4 LSBs))

@@ -33,6 +37,11 @@ bool is_non_performant_core(const struct cpuinfo_uarch_info* uarch_info) {
     case cpuinfo_uarch_cortex_a53:
     case cpuinfo_uarch_cortex_a510:
     case cpuinfo_uarch_icestorm:
+    case cpuinfo_uarch_blizzard:
+    case cpuinfo_uarch_sawtooth:
+    case cpuinfo_uarch_coll_sawtooth:
+    case cpuinfo_uarch_tupai_sawtooth:
+    case cpuinfo_uarch_tahiti_sawtooth:
       return true;
     // This can be so many other cores.
     // Need to update this to better account for slow cores

@@ -167,6 +176,23 @@ uint32_t get_num_performant_cores() {
   // On the OnePlus 12, although it has 2 little cores, the topology
   // reported in /sys/devices/system/cpu/cpu*/topology/core_siblings_list
   // is wrong, which results in the wrong configuration
+#if defined(__aarch64__) && defined(__APPLE__)
+  // Copied from ATen/ParallelCommon.cpp
+  // On Apple Silicon there are efficiency and performance cores
+  // Restrict parallel algorithms to performance cores by default
+  int32_t num_cores = -1;
+  size_t num_cores_len = sizeof(num_cores);
+  if (sysctlbyname(
+          "hw.perflevel0.physicalcpu",
+          &num_cores,
+          &num_cores_len,
+          nullptr,
+          0) == 0) {
+    if (num_cores > 1) {
+      return static_cast<uint32_t>(num_cores);
+    }
+  }
+#endif
   return _get_num_performant_cores();
 }
 }
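
The same sysctl key can be probed from Python for a quick sanity check; this is macOS-only and purely illustrative, since libc on other platforms has no sysctlbyname:

import ctypes
import ctypes.util

libc = ctypes.CDLL(ctypes.util.find_library("c"))
val = ctypes.c_int32(-1)
size = ctypes.c_size_t(ctypes.sizeof(val))
# hw.perflevel0.physicalcpu = number of physical performance (P) cores
ok = libc.sysctlbyname(
    b"hw.perflevel0.physicalcpu",
    ctypes.byref(val),
    ctypes.byref(size),
    None,
    ctypes.c_size_t(0),
)
if ok == 0 and val.value > 1:
    print("performance cores:", val.value)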
