Merge branch 'main' into gh/SS-JIA/229/orig

SS-JIA · web-flow · commit dab24ec944d1 · 2025-06-02T20:04:03.000-04:00
diff --git a/.github/scripts/label_utils.py b/.github/scripts/label_utils.py
@@ -22,9 +22,7 @@
 
 LABEL_ERR_MSG_TITLE = "This PR needs a `release notes:` label"
 LABEL_ERR_MSG = f"""# {LABEL_ERR_MSG_TITLE}
-If your change should be included in the release notes (i.e. would users of this library care about this change?), please use a label starting with `release notes:`.
-
-If not, please add the `release notes: none` label.
+If your change should be included in the release notes (i.e. would users of this library care about this change?), please use a label starting with `release notes:`. This helps us keep track and include your important work in the next release notes.
 
 To add a label, you can comment to pytorchbot, for example
 `@pytorchbot label "release notes: none"`
diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py
@@ -59,12 +59,7 @@
     patterns_to_regex,
     retries_decorator,
 )
-from label_utils import (
-    gh_add_labels,
-    gh_remove_label,
-    has_required_labels,
-    LABEL_ERR_MSG,
-)
+from label_utils import gh_add_labels, gh_remove_label
 from trymerge_explainer import get_revert_message, TryMergeExplainer
 
 # labels
@@ -2116,9 +2111,6 @@ def merge(
     # Check for approvals
     find_matching_merge_rule(pr, repo, skip_mandatory_checks=True)
 
-    if not has_required_labels(pr):
-        raise RuntimeError(LABEL_ERR_MSG.lstrip(" #"))
-
     if ignore_current:
         checks = pr.get_checkrun_conclusions()
         _, failing, _ = categorize_checks(
diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml
@@ -51,4 +51,4 @@ jobs:
           PR_NUM: ${{ github.event.number || github.event.inputs.pr_number }}
         run: |
           set -ex
-          python3 .github/scripts/check_labels.py --exit-non-zero "${PR_NUM}"
+          python3 .github/scripts/check_labels.py "${PR_NUM}"
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
@@ -2065,11 +2065,10 @@ def call_operator(
         return super().call_operator(op, args, kwargs, meta)
 
 
-@register_cadence_pass(CadencePassAttribute(opt_level=2))
-class ReplaceGeluWithApproximateGeluPass(ExportPass):
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceAtenApproxGeluWithApproxGeluPass(ExportPass):
     """
-    Replace the gelu op with an approximate gelu op. The approximate gelu op
-    is more efficient on DSP backends.
+    Replace an aten gelu op that carries an `approximate` kwarg with an approximate gelu op.
     """
 
     def call_operator(
@@ -2079,6 +2078,9 @@ def call_operator(
         kwargs: Dict[str, Argument],
         meta: NodeMetadata,
     ) -> ProxyValue:
+        if "approximate" not in kwargs:
+            return super().call_operator(op, args, kwargs, meta)
+
         if op not in {
             exir_ops.edge.aten.gelu.default,
         }:
@@ -2414,7 +2416,7 @@ class CadenceReplaceOpsInGraph:
         ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
         ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
         ReplaceWhereWithFullArgsWithWhereScalar,
-        ReplaceGeluWithApproximateGeluPass,
+        ReplaceAtenApproxGeluWithApproxGeluPass,
         ReplaceSplitWithSlicePass,
         ReplacePowWithMulPass,
     ]
diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py
@@ -26,13 +26,13 @@
     ForceChannelLastForConvPass,
     MakeSliceAndCatDimOutermostPass,
     ReplaceAddMMWithLinearPass,
+    ReplaceAtenApproxGeluWithApproxGeluPass,
     ReplaceAtenConvolutionWithJarvisConvolutionPass,
     ReplaceConstantPadNdWithSlicePass,
     ReplaceConvolutionOptionalArgsWithConcreteArgsPass,
     ReplaceConvWithIm2RowAndLinear,
     ReplaceEmptyTensorsWithFullPass,
     ReplaceFunctionallyEquivalentOpTargets,
-    ReplaceGeluWithApproximateGeluPass,
     ReplaceIm2RowWithViewPass,
     ReplaceLinearWithFullyConnectedOpPass,
     ReplaceMatmulWithTransposedMatmulPass,
@@ -1287,17 +1287,41 @@ def forward(self, cond: torch.Tensor):
             1,
         )
 
-    def test_replace_aten_gelu_with_approximate_gelu(self):
-        class Gelu(torch.nn.Module):
-            def forward(self, input):
-                return torch.nn.functional.gelu(input)
+    def test_no_replace_aten_gelu_with_approximate_gelu(self):
+        inputs = torch.randn(2, 1, 64)
+
+        gm = single_op_builder(
+            placeholders=(inputs,),
+            op=exir_ops.edge.aten.gelu.default,
+            args=(inputs,),
+        )
+        gm = ExportPass().call(gm).graph_module
+
+        p = ReplaceAtenApproxGeluWithApproxGeluPass()
+        graph_after_passes = p.call(gm).graph_module
 
+        # Assert that aten.gelu op was not decomposed, since it didn't have an approximate argument
+        self.assertEqual(
+            count_node(
+                graph_after_passes,
+                exir_ops.edge.aten.gelu.default,
+            ),
+            1,
+        )
+
+    def test_replace_aten_approximate_gelu_with_approximate_gelu(self):
         inputs = torch.randn(2, 1, 64)
 
-        graph_module = export_to_edge(Gelu(), (inputs,)).exported_program().graph_module
+        gm = single_op_builder(
+            placeholders=(inputs,),
+            op=exir_ops.edge.aten.gelu.default,
+            args=(inputs,),
+            kwargs={"approximate": "tanh"},
+        )
+        gm = ExportPass().call(gm).graph_module
 
-        p = ReplaceGeluWithApproximateGeluPass()
-        graph_after_passes = cast(PassResult, p(graph_module)).graph_module
+        p = ReplaceAtenApproxGeluWithApproxGeluPass()
+        graph_after_passes = p.call(gm).graph_module
 
         # Assert that aten.gelu op was decomposed
         self.assertEqual(
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -3880,6 +3880,41 @@ def test_conv_former(self):
                 self.assertGreaterEqual(msg["top_1"], 60)
                 self.assertGreaterEqual(msg["top_5"], 80)
 
+    def test_deit(self):
+        """Run the DeiT example script and check the accuracy it reports."""
+        if not self.required_envs([self.image_dataset]):
+            self.skipTest("missing required envs")
+
+        script = f"{self.executorch_root}/examples/qualcomm/oss_scripts/deit.py"
+        # (flag, value) pairs, in the order they appear on the command line.
+        options = (
+            ("--dataset", self.image_dataset),
+            ("--artifact", self.artifact_dir),
+            ("--build_folder", self.build_folder),
+            ("--device", self.device),
+            ("--model", self.model),
+            ("--ip", self.ip),
+            ("--port", str(self.port)),
+        )
+        cmds = ["python", script]
+        for flag, value in options:
+            cmds += [flag, value]
+        if self.host:
+            cmds += ["--host", self.host]
+
+        # Start the script first, then accept its connection and read the
+        # JSON message it sends over the listener socket.
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            # self.fail raises, so the accuracy checks only run on success.
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            self.assertGreaterEqual(msg["top_1"], 75)
+            self.assertGreaterEqual(msg["top_5"], 90)
+
     def test_dino_v2(self):
         if not self.required_envs([self.image_dataset]):
             self.skipTest("missing required envs")
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
@@ -164,7 +164,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus
 ```
 # No quantization
 # Set these paths to point to the downloaded files
-LLAMA_CHECKPOINT=path/to/checkpoint.pth
+LLAMA_CHECKPOINT=path/to/consolidated.00.pth
 LLAMA_PARAMS=path/to/params.json
 
 python -m examples.models.llama.export_llama \
@@ -186,7 +186,7 @@ For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/exec
 ```
 # SpinQuant
 # Set these paths to point to the exported files
-LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth
+LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth
 LLAMA_PARAMS=path/to/spinquant/params.json
 
 python -m examples.models.llama.export_llama \
@@ -215,7 +215,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
 ```
 # QAT+LoRA
 # Set these paths to point to the exported files
-LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/checkpoint.pth
+LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth
 LLAMA_PARAMS=path/to/qlora/params.json
 
 python -m examples.models.llama.export_llama \
@@ -248,7 +248,7 @@ You can export and run the original Llama 3 8B instruct model.
 2. Export model and generate `.pte` file
     ```
     python -m examples.models.llama.export_llama \
-	    --checkpoint <consolidated.00.pth> \
+	    --checkpoint <consolidated.00.pth> \
 		-p <params.json> \
 		-kv \
 		--use_sdpa_with_kv_cache \
@@ -396,7 +396,7 @@ First export your model for lowbit quantization (step 2 above):
 
 ```
 # Set these paths to point to the downloaded files
-LLAMA_CHECKPOINT=path/to/checkpoint.pth
+LLAMA_CHECKPOINT=path/to/consolidated.00.pth
 LLAMA_PARAMS=path/to/params.json
 
 # Set low-bit quantization parameters
@@ -476,7 +476,7 @@ We use [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness) to evaluat
 For base models, use the following example command to calculate its perplexity based on WikiText.
 ```
 python -m examples.models.llama.eval_llama \
-	-c <checkpoint.pth> \
+	-c <consolidated.00.pth> \
 	-p <params.json> \
 	-t <tokenizer.model/bin> \
 	-kv \
@@ -489,7 +489,7 @@ python -m examples.models.llama.eval_llama \
 For instruct models, use the following example command to calculate its MMLU score.
 ```
 python -m examples.models.llama.eval_llama \
-	-c <checkpoint.pth> \
+	-c <consolidated.00.pth> \
 	-p <params.json> \
 	-t <tokenizer.model/bin> \
 	-kv \
diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md
@@ -88,4 +88,4 @@ cmake-out/examples/models/llama/llama_main
 To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section.
 
 ### FAQ
-For more help with exporting or running this model, feel free to ask in our [discord channel](https://lnkd.in/gWCM4ViK).
+For more help with exporting or running this model, feel free to ask in our [discord channel](https://discord.gg/UEjkY9Zs).
diff --git a/examples/qualcomm/oss_scripts/deit.py b/examples/qualcomm/oss_scripts/deit.py
@@ -0,0 +1,148 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import getpass
+import json
+import os
+from multiprocessing.connection import Client
+
+import numpy as np
+from executorch.backends.qualcomm._passes.qnn_pass_manager import (
+    get_capture_program_passes,
+)
+from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+from executorch.examples.qualcomm.utils import (
+    build_executorch_binary,
+    get_imagenet_dataset,
+    make_output_dir,
+    parse_skip_delegation_node,
+    setup_common_args_and_variables,
+    SimpleADB,
+    topk_accuracy,
+)
+from transformers import AutoConfig, AutoModelForImageClassification
+
+
+def get_instance():
+    """Load the pretrained DeiT image classifier, in eval mode, on the CPU."""
+    classifier = AutoModelForImageClassification.from_pretrained(
+        "facebook/deit-base-distilled-patch16-224"
+    )
+    # Same call order as a fluent chain would give: eval() first, then
+    # to("cpu"); each result feeds the next call.
+    classifier = classifier.eval()
+    classifier = classifier.to("cpu")
+    return classifier
+
+
+def main(args):
+    """Build the DeiT program, run it via SimpleADB, and report top-k accuracy.
+
+    Returns right after the build step when ``args.compile_only`` is set.
+    Accuracies are sent to a client at ``(args.ip, args.port)`` when an ip is
+    set and the port is not -1; otherwise they are printed.
+    """
+    pte_filename = "deit_qnn"
+    data_num = 100
+    k_val = [1, 5]
+
+    skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args)
+    os.makedirs(args.artifact, exist_ok=True)
+    # Height and width are both taken from the config's image_size.
+    config = AutoConfig.from_pretrained("facebook/deit-base-distilled-patch16-224")
+    side = config.image_size
+    inputs, targets, input_list = get_imagenet_dataset(
+        dataset_path=f"{args.dataset}",
+        data_size=data_num,
+        image_shape=(side, side),
+        crop_size=(side, side),
+    )
+
+    # Lower to QNN.
+    model = get_instance()
+    passes_job = get_capture_program_passes()
+    build_executorch_binary(
+        model,
+        inputs[0],
+        args.model,
+        f"{args.artifact}/{pte_filename}",
+        dataset=inputs,
+        skip_node_id_set=skip_node_id_set,
+        skip_node_op_set=skip_node_op_set,
+        quant_dtype=QuantDtype.use_8a8w,
+        passes_job=passes_job,
+        shared_buffer=args.shared_buffer,
+    )
+    if args.compile_only:
+        return
+
+    workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}"
+    adb = SimpleADB(
+        qnn_sdk=os.getenv("QNN_SDK_ROOT"),
+        build_path=f"{args.build_folder}",
+        pte_path=f"{args.artifact}/{pte_filename}.pte",
+        workspace=workspace,
+        device_id=args.device,
+        host_id=args.host,
+        soc_model=args.model,
+    )
+    adb.push(inputs=inputs, input_list=input_list)
+    adb.execute()
+
+    # Collect output data.
+    output_data_folder = f"{args.artifact}/outputs"
+    make_output_dir(output_data_folder)
+    adb.pull(output_path=args.artifact)
+
+    # Top-k analysis: one output_<i>_0.raw file per index, read as float32.
+    predictions = [
+        np.fromfile(
+            os.path.join(output_data_folder, f"output_{idx}_0.raw"),
+            dtype=np.float32,
+        )
+        for idx in range(data_num)
+    ]
+    topk = [topk_accuracy(predictions, targets, k).item() for k in k_val]
+    if args.ip and args.port != -1:
+        payload = {f"top_{k}": score for k, score in zip(k_val, topk)}
+        with Client((args.ip, args.port)) as conn:
+            conn.send(json.dumps(payload))
+    else:
+        for k, score in zip(k_val, topk):
+            print(f"top_{k}->{score}%")
+
+
+if __name__ == "__main__":
+    parser = setup_common_args_and_variables()
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts and output by this example. Default ./deit_qnn",
+        default="./deit_qnn",
+        type=str,
+    )
+
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+        required=True,
+    )
+    # On failure, notify the ip/port client if configured; otherwise re-raise.
+    args = parser.parse_args()
+    try:
+        main(args)
+    except Exception as e:
+        if args.ip and args.port != -1:
+            with Client((args.ip, args.port)) as conn:
+                conn.send(json.dumps({"Error": str(e)}))
+        else:
+            raise  # keep the original exception type and traceback