@@ -41,7 +41,7 @@ def quantize( # noqa C901
     checkpoint_dtype: Optional[DType] = None,
     checkpoint_path: Optional[Path] = None,
     # following arguments only available when setting int4 or gptq quantization.
-    group_size: Optional[int] = 128,
+    group_size: Optional[int] = None,
     # following arguments are only used for GPTQ
     calibration_tasks: Optional[list] = None,
     calibration_limit: Optional[int] = None,
@@ -146,9 +146,9 @@ def quantize( # noqa C901
         print("quantized model:", model)
         return model
     elif qmode == "8da4w":
-        # Check for required args
         if group_size is None:
-            raise Exception("For 8da4w quantization, group size must be specified.")
+            # TODO: Default value for group size for 8da4w. Need this here for refactor, will clean this up.
+            group_size = 128

         from torchao.quantization import int8_dynamic_activation_int4_weight, quantize_
         from torchao.utils import unwrap_tensor_subclass
@@ -784,16 +784,20 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor:
 ############################ Source Transform Start #######################


-def get_quant_embedding_transform(args, dtype_override: Optional[DType] = None):
-    if args.embedding_quantize.startswith("torchao:"):
+def get_quant_embedding_transform(
+    embedding_quantize: str,
+    use_shared_embedding: bool = False,
+    dtype_override: Optional[DType] = None,
+):
+    if embedding_quantize.startswith("torchao:"):
         from torchao.experimental.quant_api import (
             EmbeddingQuantizer,
             SharedEmbeddingQuantizer,
         )
         from torchao.quantization.granularity import PerAxis, PerGroup
         from torchao.quantization.quant_api import MappingType

-        quant_args = args.embedding_quantize.split(":")[1].split(",")
+        quant_args = embedding_quantize.split(":")[1].split(",")
         if len(quant_args) == 2:
             bitwidth, group_size = quant_args
             is_asymmetric = True
@@ -814,7 +818,7 @@ def get_quant_embedding_transform(args, dtype_override: Optional[DType] = None):
 
     def _torchao_embedding_quantizer(model):
         with torch.no_grad():
-            if not args.use_shared_embedding:
+            if not use_shared_embedding:
                 EmbeddingQuantizer(
                     weight_dtype=weight_dtype,
                     granularity=granularity,
@@ -831,7 +835,7 @@ def _torchao_embedding_quantizer(model):
 
         return _torchao_embedding_quantizer
 
-    bitwidth, group_size = args.embedding_quantize.split(",")
+    bitwidth, group_size = embedding_quantize.split(",")
     if group_size == "none" or group_size == "None" or group_size == "0":
         group_size = None
     else:
@@ -848,34 +852,27 @@ def _torchao_embedding_quantizer(model):
 
 
 def get_quant_weight_transform(
-    args,
+    quantization_mode: str,
+    group_size: Optional[int] = None,
     computation_dtype: Optional[DType] = None,
     checkpoint_dtype: Optional[DType] = None,
+    checkpoint_path: Optional[Path] = None,
+    tokenizer_path: Optional[Path] = None,
+    calibration_tasks: Optional[list] = None,
+    calibration_limit: Optional[int] = None,
+    calibration_seq_length: Optional[int] = None,
 ):
-    # If these optional args are None, don't provide them to quantize().
-    quant_args_str = [
-        "group_size",
-        "calibration_tasks",
-        "calibration_limit",
-        "calibration_seq_length",
-    ]
-    arg_dict = vars(args)
-    quant_args = {
-        param: val
-        for param in quant_args_str
-        if (val := arg_dict.get(param)) is not None
-    }
-
     return partial(
         quantize,
-        **quant_args,
-        qmode=args.quantization_mode,
+        qmode=quantization_mode,
         computation_dtype=computation_dtype,
         checkpoint_dtype=checkpoint_dtype,
-        checkpoint_path=(Path(path) if (path := args.checkpoint) is not None else None),
-        tokenizer_path=(
-            Path(path) if (path := args.tokenizer_path) is not None else None
-        ),
+        checkpoint_path=(Path(path) if (path := checkpoint_path) is not None else None),
+        group_size=group_size,
+        calibration_tasks=calibration_tasks,
+        calibration_limit=calibration_limit,
+        calibration_seq_length=calibration_seq_length,
+        tokenizer_path=(Path(path) if (path := tokenizer_path) is not None else None),
     )
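For readers following the refactor, here is a minimal usage sketch of the two helpers after this change, called with explicit keyword arguments instead of an argparse namespace. The bit widths, group sizes, checkpoint path, and the `model` variable are illustrative assumptions, not values from this commit.

```python
from pathlib import Path

# Assumes get_quant_embedding_transform / get_quant_weight_transform are
# imported from the module modified in this diff, and that `model` is an
# eager nn.Module (hypothetical variable for illustration).

embedding_transform = get_quant_embedding_transform(
    embedding_quantize="4,32",  # "<bitwidth>,<group_size>" form parsed by the helper
    use_shared_embedding=False,
)
# Assumption: the returned callable is applied to the model, mirroring the
# _torchao_embedding_quantizer(model) signature shown in the diff.
model = embedding_transform(model)

# Returns functools.partial(quantize, ...); with this change, group_size may be
# left as None and the 8da4w path defaults it to 128 inside quantize().
weight_transform = get_quant_weight_transform(
    quantization_mode="8da4w",
    group_size=128,                    # optional after this change
    checkpoint_path=Path("llama.pth"),  # illustrative path
)
model = weight_transform(model)
```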