Commit 58a0d08

Merge branch 'main' into fix-fp-to-int-casting-lowering
2 parents: 543740c + d952326

6 files changed (+237, -125 lines)

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 23 additions & 0 deletions
@@ -20,6 +20,8 @@
 from torch.fx import Node
 from torchao.quantization.pt2e import FixedQParamsObserver, MinMaxObserver
 from torchao.quantization.pt2e.quantizer import (
+    annotate_input_qspec_map,
+    annotate_output_qspec,
     QuantizationAnnotation,
     QuantizationSpec,
     SharedQuantizationSpec,
@@ -213,6 +215,24 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None
         _annotated=True,
     )
 
+def annotate_rms_norm(node: Node, quantization_config: QuantizationConfig) -> None:
+    act_node = node.args[0]
+    weight_node = node.args[2]
+
+    # TODO current only support 16a16w
+    annotate_input_qspec_map(
+        node,
+        act_node,
+        quantization_config.input_activation,
+    )
+
+    annotate_input_qspec_map(
+        node,
+        weight_node,
+        quantization_config.input_activation,
+    )
+    annotate_output_qspec(node, quantization_config.output_activation)
+
 def annotate_single_in_single_out(
     node: Node, quantization_config: QuantizationConfig
 ) -> None:
@@ -287,6 +307,9 @@ def annotate_matmul_input1(node: Node):
        elif node.target == torch.ops.aten.flatten.using_ints:
            annotate_single_in_share_out(node, quantization_config_8a8w)
            node = node.args[0]
+       elif node.target == torch.ops.aten.rms_norm.default:
+           annotate_rms_norm(node, quantization_config_8a8w)
+           node = node.args[0]
        elif node.target == torch.ops.aten.cat.default:
            annotate_cat(node, quantization_config_8a8w)
            # For v, we tag 8a until conv op.
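
For context: aten.rms_norm takes (input, normalized_shape, weight, eps), which is why the new helper reads the activation from node.args[0] and the affine weight from node.args[2]. A minimal illustration of that signature, assuming a recent PyTorch that provides torch.nn.functional.rms_norm (not code from this commit):

import torch

x = torch.randn(2, 8)   # activation -> args[0] of aten.rms_norm
gamma = torch.ones(8)   # affine weight -> args[2]
y = torch.nn.functional.rms_norm(x, normalized_shape=(8,), weight=gamma, eps=1e-6)
print(y.shape)  # torch.Size([2, 8])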

examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py

Lines changed: 58 additions & 9 deletions
@@ -20,6 +20,14 @@
     annotate_matmul_16a8w,
 )
 
+from executorch.backends.qualcomm.quantizer.observers.per_channel_param_observer import (
+    PerChannelParamObserver,
+)
+from executorch.backends.qualcomm.quantizer.qconfig import (
+    _derived_bias_quant_spec,
+    QuantizationConfig,
+)
+
 from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
 from executorch.backends.qualcomm.utils.utils import convert_linear_to_conv2d
 
@@ -47,6 +55,8 @@
 
 from torchao.quantization.pt2e import MinMaxObserver
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from torchao.quantization.pt2e.quantizer import QuantizationSpec
+
 
 sys.setrecursionlimit(4096)
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
@@ -78,6 +88,33 @@ def forward(
         return self.model.forward(tokens, self.atten_mask)
 
 
+def add_mse_weight_observer(quant_dtype, quantizer):
+    weight_dtype = (
+        torch.int4
+        if quant_dtype in (QuantDtype.use_16a4w, QuantDtype.use_16a4w_block)
+        else torch.int8
+    )
+    per_channel_q_config = quantizer.default_quant_config.quant_config
+    weight_qspec = QuantizationSpec(
+        dtype=torch.int8 if weight_dtype == torch.int4 else weight_dtype,
+        quant_min=(
+            -7 if weight_dtype == torch.int4 else torch.iinfo(weight_dtype).min + 1
+        ),
+        quant_max=(7 if weight_dtype == torch.int4 else torch.iinfo(weight_dtype).max),
+        qscheme=torch.per_channel_symmetric,
+        ch_axis=0,
+        observer_or_fake_quant_ctr=PerChannelParamObserver.with_args(
+            **{"steps": 200, "use_mse": True}
+        ),
+    )
+    quantizer.default_quant_config.per_channel_quant_config = QuantizationConfig(
+        input_activation=per_channel_q_config.input_activation,
+        output_activation=per_channel_q_config.output_activation,
+        weight=weight_qspec,
+        bias=_derived_bias_quant_spec,
+    )
+
+
 def gen_eval_wrapper(model_name, args):
     tokenizer = get_tokenizer(args.tokenizer_path)
     with open(args.params) as f:
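
For context: with --range_setting mse_weight, the default per-channel weight observer is swapped for PerChannelParamObserver(steps=200, use_mse=True), and 4-bit weights use a symmetric ±7 range stored in int8 (see the QuantizationSpec above). The idea behind MSE-based range setting, as a rough self-contained sketch of the concept rather than the observer's actual implementation:

import torch

def mse_weight_scale_per_channel(w: torch.Tensor, n_bits: int = 4, steps: int = 200) -> torch.Tensor:
    # Sweep progressively smaller clip ranges and keep, per output channel,
    # the one whose fake-quantized reconstruction has the lowest MSE.
    flat = w.reshape(w.shape[0], -1)
    abs_max = flat.abs().amax(dim=1)            # the plain min/max range
    qmax = 2 ** (n_bits - 1) - 1                # 7 for 4-bit symmetric
    best_max = abs_max.clone()
    best_err = torch.full_like(abs_max, float("inf"))
    for i in range(1, steps + 1):
        cand = abs_max * i / steps              # candidate clip value
        scale = (cand / qmax).clamp(min=1e-12)
        q = (flat / scale[:, None]).round().clamp(-qmax, qmax)
        err = ((q * scale[:, None] - flat) ** 2).mean(dim=1)
        better = err < best_err
        best_err = torch.where(better, err, best_err)
        best_max = torch.where(better, cand, best_max)
    return best_max / qmax                      # per-channel scale

print(mse_weight_scale_per_channel(torch.randn(4, 16)).shape)  # torch.Size([4])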
@@ -142,13 +179,13 @@ def permute(w, heads):
        if getattr(layer.feed_forward, "prepare_feedfoward_conv", None):
            layer.feed_forward.prepare_feedfoward_conv()
 
-    model.to(dtype=torch.bfloat16)
+    model.to(dtype=torch.float)
     model.to(device=args.device)
 
     tokens, atten_mask = model.get_example_inputs(use_kv_cache=False)
     tokens = tokens.to(device=args.device)
     atten_mask = atten_mask.to(device=args.device)
-    atten_mask = atten_mask.to(dtype=torch.bfloat16)
+    atten_mask = atten_mask.to(dtype=torch.float)
     inputs = (tokens, atten_mask)
 
     if args.embedding_quantize:
@@ -174,7 +211,8 @@ def permute(w, heads):
        )
        quantizer.add_custom_quant_annotations(custom_annotations)
 
-       model.has_quant_io = True
+       if args.range_setting == "mse_weight":
+           add_mse_weight_observer(quant_dtype, quantizer)
 
        with torch.no_grad():
            model = torch.export.export(model, inputs, strict=True).module()
@@ -245,6 +283,23 @@ def main() -> None:
     torch.manual_seed(seed)
     modelname = "llama2"
     parser = build_args_parser()
+    parser.add_argument(
+        "-P",
+        "--ptq",
+        help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. Support 8a8w, 16a4w and 16a4w_block.",
+        type=str,
+    )
+    parser.add_argument(
+        "--range_setting",
+        help="Choose which range setting method (e.g. mse_weight). If not specified, will do minmax for weights and activations",
+        type=str,
+    )
+    parser.add_argument(
+        "--limit",
+        help="the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples",
+        type=str,
+    )
+
     args = parser.parse_args()
     args.llama_model = "llama3_2"
     # Overrides this arg, because evaluation requires full logits.
@@ -257,15 +312,9 @@ def main() -> None:
     args.use_kv_cache = False
     args.prefill_ar_len = args.max_seq_length
 
-    # To do fewer samples for faster evaluation
-    args.limit = 0.1
-    # args.samples = {'wikitext': list(range(1))}
-
    args.device = "cuda" if torch.cuda.is_available() else "cpu"
    torch.set_default_device(args.device)
 
-    args.ptq = "8a8w"
-
    eval_llama(modelname, args)
 
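For context: the three new flags replace values that main() previously hard-coded (args.ptq = "8a8w", args.limit = 0.1). The --limit help text implies the usual fraction-vs-count convention; a hedged sketch of how such a string might be normalized downstream (the commit itself leaves this to the eval harness):

from typing import Optional, Union

def normalize_limit(limit: Optional[str]) -> Optional[Union[int, float]]:
    # Values below 1 act as a fraction of the dataset, otherwise as an absolute count.
    if limit is None:
        return None
    value = float(limit)
    return value if value < 1 else int(value)

print(normalize_limit("0.1"), normalize_limit("200"), normalize_limit(None))  # 0.1 200 None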

exir/passes/remove_mixed_type_operators.py

Lines changed: 9 additions & 1 deletion
@@ -23,12 +23,20 @@ def call_operator(self, op, args, kwargs, meta: NodeMetadata):  # noqa: C901
         promotion_type_allow_list = {
             torch.ops.aten.add.Tensor: ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
             torch.ops.aten.mul.Tensor: ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-            torch.ops.aten.div.Tensor: ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+            # The correct promotion for div depends on the mode! If there is no mode,
+            # it's INT_TO_FLOAT, otherwise it's default.
+            torch.ops.aten.div.Tensor: ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+            torch.ops.aten.div.Tensor_mode: ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
             torch.ops.aten.minimum.default: ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
         }
 
         if op in promotion_type_allow_list:
             promotion_kind = promotion_type_allow_list[op]
+            if (
+                op == torch.ops.aten.div.Tensor_mode
+                and kwargs.get("rounding_mode") is None
+            ):
+                promotion_kind = ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
         else:
             # Not in allow list, do nothing
             return super().call_operator(op, args, kwargs, meta)
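
For context: eager PyTorch promotes integer inputs to floating point for true division but keeps the integer dtype when a rounding mode is given, which is exactly the asymmetry the comment above describes. A quick illustration of the standard torch semantics (shown for context, not taken from this diff):

import torch

a = torch.tensor([7, 8, 9])  # int64
b = torch.tensor([2, 2, 2])  # int64

print(torch.div(a, b).dtype)                         # torch.float32: true division promotes
print(torch.div(a, b, rounding_mode=None).dtype)     # torch.float32: None behaves like no mode
print(torch.div(a, b, rounding_mode="trunc").dtype)  # torch.int64: integer dtype is kept
print(torch.div(a, b, rounding_mode="floor").dtype)  # torch.int64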

exir/tests/test_passes.py

Lines changed: 101 additions & 77 deletions
@@ -9,7 +9,7 @@
 import os
 import tempfile
 import unittest
-from typing import List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple
 
 import executorch.exir as exir
 
@@ -71,6 +71,7 @@
 from functorch.experimental import control_flow
 
 from torch import nn
+from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND
 from torch.export import export
 from torch.export.graph_signature import InputKind, InputSpec, TensorArgument
 from torch.fx import GraphModule, subgraph_rewriter
@@ -121,91 +122,114 @@ def foo_out(
     return a + 1, None
 
 
+def simple_promote_dtype(
+    dtype: torch.dtype, promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND
+) -> torch.dtype:
+    if promotion_kind == ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT:
+        return dtype
+    if promotion_kind == ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT:
+        return dtype if dtype.is_floating_point else torch.float
+    else:
+        raise Exception(f"Unsupported promotion kind {promotion_kind}")
+
+
+def count_nodes_with_target_asserting_arguments_have_dtype(
+    self, module, target, arg_dtype
+) -> int:
+    count = 0
+    for node in module.graph.nodes:
+        if node.op == "call_function" and node.target == target:
+            count += 1
+            for arg in node.args:
+                self.assertEqual(arg.meta["val"].dtype, arg_dtype)
+    return count
+
+
 class TestPasses(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
         register_additional_test_aten_ops()
 
     def test_remove_mixed_type_operators(self) -> None:
-        class Add(torch.nn.Module):
-            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-                return (x + y) + x
-
-        add = Add()
-
-        int_tensor = torch.tensor([[1, 2, 3]])
-        float_tensor = torch.tensor([[1.0, 2.0, 3.0]])
-        edge_prog = to_edge(export(add, (int_tensor, float_tensor), strict=True))
-
-        new_prog = edge_prog.transform([RemoveMixedTypeOperators()])
-        new_graph_module = new_prog.exported_program().graph_module
-        self.assertIsNotNone(new_graph_module)
-
-        add_count = 0
-
-        for node in new_graph_module.graph.nodes:
-            if (
-                node.op == "call_function"
-                and node.target == exir_ops.edge.aten.add.Tensor
-            ):
-                add_count += 1
-                node_args = node.args
-                for arg in node_args:
-                    self.assertEqual(arg.meta["val"].dtype, torch.float)
-
-        self.assertEqual(add_count, 2)
-
-        double_tensor = torch.tensor([[1.0, 2.0, 3.0]])
-        double_tensor = double_tensor.to(torch.double)
-
-        double_prog = to_edge(export(add, (int_tensor, double_tensor), strict=True))
-
-        double_prog.transform([RemoveMixedTypeOperators()])
-        new_graph_module_double = double_prog.exported_program().graph_module
-        self.assertIsNotNone(new_graph_module_double)
-
-        add_count_double = 0
-
-        for node in new_graph_module_double.graph.nodes:
-            if (
-                node.op == "call_function"
-                and node.target == exir_ops.edge.aten.add.Tensor
-            ):
-                add_count_double += 1
-                node_args = node.args
-                for arg in node_args:
-                    self.assertEqual(arg.meta["val"].dtype, torch.double)
-
-        self.assertEqual(add_count_double, 2)
-
-        class Mult(torch.nn.Module):
-            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-                return x * y
-
-        mult = Mult()
-
-        float_tensor_vert = float_tensor.T
-        mult_prog = to_edge(export(mult, (int_tensor, float_tensor_vert), strict=True))
-
-        # graph_module_mult.graph.print_tabular()
+        def make_module(fwd: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]):
+            class Module(torch.nn.Module):
+                def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                    return fwd(x, y)
+
+            return Module
+
+        Add = make_module(lambda x, y: (x + y) + x)
+        Mult = make_module(lambda x, y: x * y)
+        Minimum = make_module(torch.minimum)
+        DivWithoutMode = make_module(torch.div)
+        DivWithNoneMode = make_module(lambda x, y: torch.div(x, y, rounding_mode=None))
+        DivWithTruncMode = make_module(
+            lambda x, y: torch.div(x, y, rounding_mode="trunc")
+        )
+        DivWithFloorMode = make_module(
+            lambda x, y: torch.div(x, y, rounding_mode="floor")
+        )
 
-        mult_prog = mult_prog.transform([RemoveMixedTypeOperators()])
-        new_graph_module_mult = mult_prog.exported_program().graph_module
-        self.assertIsNotNone(new_graph_module_mult)
+        for module, op, expected_count, promotion_kind in (
+            (
+                Add,
+                exir_ops.edge.aten.add.Tensor,
+                2,
+                ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+            ),
+            (
+                Mult,
+                exir_ops.edge.aten.mul.Tensor,
+                1,
+                ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+            ),
+            (
+                Minimum,
+                exir_ops.edge.aten.minimum.default,
+                1,
+                ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+            ),
+            (
+                DivWithoutMode,
+                exir_ops.edge.aten.div.Tensor,
+                1,
+                ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+            ),
+            (
+                DivWithNoneMode,
+                exir_ops.edge.aten.div.Tensor_mode,
+                1,
+                ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+            ),
+            (
+                DivWithTruncMode,
+                exir_ops.edge.aten.div.Tensor_mode,
+                1,
+                ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+            ),
+            (
+                DivWithFloorMode,
+                exir_ops.edge.aten.div.Tensor_mode,
+                1,
+                ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+            ),
+        ):
+            for second_arg_dtype in (torch.int64, torch.float, torch.double):
+                int_tensor = torch.tensor([[1, 2, 3]], dtype=torch.int64)
+                float_tensor = torch.tensor([[1.0, 2.0, 3.0]], dtype=second_arg_dtype)
+                edge_prog = to_edge(
+                    export(module(), (int_tensor, float_tensor), strict=True)
+                )
 
-        mult_count = 0
+                new_prog = edge_prog.transform([RemoveMixedTypeOperators()])
+                new_graph_module = new_prog.exported_program().graph_module
+                self.assertIsNotNone(new_graph_module)
 
-        for node in new_graph_module_mult.graph.nodes:
-            if (
-                node.op == "call_function"
-                and node.target == exir_ops.edge.aten.mul.Tensor
-            ):
-                mult_count += 1
-                node_args = node.args
-                for arg in node_args:
-                    self.assertEqual(arg.meta["val"].dtype, torch.float)
-
-        self.assertEqual(mult_count, 1)
+                promoted_type = simple_promote_dtype(second_arg_dtype, promotion_kind)
+                count = count_nodes_with_target_asserting_arguments_have_dtype(
+                    self, new_graph_module, op, promoted_type
+                )
+                self.assertEqual(count, expected_count)
 
     def test_remove_noop_pass(self) -> None:
         class Foo(torch.nn.Module):
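
For context, the expectations the parametrized test encodes: with an int64 first operand, DEFAULT promotion keeps the second operand's dtype, while INT_TO_FLOAT floors integer inputs at torch.float. A small illustration reusing simple_promote_dtype from the hunk above (illustrative, not part of the commit):

import torch
from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND

# simple_promote_dtype is the helper added in exir/tests/test_passes.py above.
assert simple_promote_dtype(torch.int64, ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) == torch.int64
assert simple_promote_dtype(torch.double, ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) == torch.double
assert simple_promote_dtype(torch.int64, ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) == torch.float
assert simple_promote_dtype(torch.double, ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) == torch.double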
