Commit 3760458

Add Static Stories Llama CI
1 parent a5c7609 commit 3760458

File tree

.ci/scripts/setup-stories-llama.sh
.github/workflows/pull.yml
backends/qualcomm/tests/test_qnn_delegate.py
backends/qualcomm/tests/utils.py
examples/qualcomm/oss_scripts/llama/llama.py

5 files changed: +75 -5 lines changed

.ci/scripts/setup-stories-llama.sh

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+# Download and prepare stories llama model artifacts
+prepare_model_artifacts() {
+  echo "Preparing stories model artifacts"
+  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
+  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
+  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
+}
+
+prepare_model_artifacts
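
The script is self-contained and writes everything to the current working directory; params.json hardcodes the stories110M architecture rather than fetching a config. A minimal sketch of running it locally (assuming wget is on PATH and you start from the repo root):

bash .ci/scripts/setup-stories-llama.sh
ls
# stories110M.pt  tokenizer.model  params.json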

.github/workflows/pull.yml

Lines changed: 25 additions & 0 deletions
@@ -440,6 +440,31 @@ jobs:
       # Test llama2
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
 
+  test-static-llama-runner-qnn-linux:
+    name: test-static-llama-runner-qnn-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use the base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+
+        # Retrieve 110M Stories Llama artifacts
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-stories-llama.sh
+
+        # Test static llama stories110m
+        PYTHON_EXECUTABLE=python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
+
   test-qnn-models-linux:
     name: test-qnn-models-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
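
Since linux.2xlarge has no attached Android device, the job exercises only the compile path, hence --compile_only. A rough local equivalent of the script block, assuming a repo checkout in an environment where the QNN setup scripts succeed (the docker image and conda activation are CI-specific):

PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-stories-llama.sh
PYTHON_EXECUTABLE=python backends/qualcomm/tests/test_qnn_delegate.py \
  -k TestExampleScript.test_stories_single_llama \
  --model SM8650 --build_folder build-android/ \
  --executorch_root . --artifact_dir . --compile_only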

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 17 additions & 4 deletions
@@ -3586,8 +3586,6 @@ def test_stories_single_llama(self):
             self.artifact_dir,
             "--build_folder",
             self.build_folder,
-            "--device",
-            self.device,
             "--model",
             self.model,
             "--checkpoint",
@@ -3610,7 +3608,17 @@
             "0",
             "--llama_model",
             "stories110m",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "128",
         ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        else:
+            cmds.extend(["--device", self.device])
         if self.host:
             cmds.extend(["--host", self.host])
 
@@ -3623,8 +3631,11 @@
         if "Error" in msg:
             self.fail(msg["Error"])
         else:
-            model_out = msg["result"][0]
-            self.assertTrue(model_out.startswith(golden_start_with))
+            if not self.compile_only:
+                model_out = msg["result"][0]
+                self.assertTrue(model_out.startswith(golden_start_with))
+            pte_size = msg["pte_size"]
+            self.assertLessEqual(pte_size, 130000000)
 
     @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
@@ -3853,6 +3864,8 @@ def setup_environment():
     TestQNN.shared_buffer = args.shared_buffer
     TestQNN.enable_x86_64 = args.enable_x86_64
     TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
+    TestQNN.compile_only = args.compile_only
+
     return sys.argv[:1] + ns_args
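
The new branch makes --compile_only and --device mutually exclusive paths through test_stories_single_llama: compile-only skips the on-device run and the golden-output check, asserting only the .pte size ceiling, while a device run still checks both. A hedged sketch of the two invocations (the compile-only form mirrors the CI job above; the --device form assumes the harness's existing device flag, with an illustrative serial):

# compile-only: no device required, asserts pte_size <= 130000000
python backends/qualcomm/tests/test_qnn_delegate.py \
  -k TestExampleScript.test_stories_single_llama \
  --model SM8650 --build_folder build-android/ \
  --executorch_root . --artifact_dir . --compile_only

# on-device: also checks that the story starts with the golden prefix
python backends/qualcomm/tests/test_qnn_delegate.py \
  -k TestExampleScript.test_stories_single_llama \
  --model SM8650 --build_folder build-android/ \
  --executorch_root . --artifact_dir . --device "$ANDROID_SERIAL"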

backends/qualcomm/tests/utils.py

Lines changed: 1 addition & 0 deletions
@@ -131,6 +131,7 @@ class TestQNN(unittest.TestCase):
     use_16a4w: str = "16a4w"
     shared_buffer: bool = False
     enable_x86_64: bool = False
+    compile_only: bool = False
 
     def _assert_outputs_equal(self, model_output, ref_output):
         self.assertTrue(len(ref_output) == len(model_output))

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 14 additions & 1 deletion
@@ -804,13 +804,14 @@ def post_process():
             outputs.append(f.read())
 
     adb.pull(output_path=args.artifact, callback=post_process)
-
     if args.ip and args.port != -1:
+        pte_size = os.path.getsize(pte_path)
        with Client((args.ip, args.port)) as conn:
            conn.send(
                json.dumps(
                    {
                        "result": outputs,
+                        "pte_size": pte_size,
                    }
                )
            )
@@ -1007,6 +1008,18 @@ def main():
             )
         else:
             logging.warning("Quant attributes of the logit is None.")
+
+        if args.ip and args.port != -1:
+            pte_path = f"{args.artifact}/{pte_filename}.pte"
+            pte_size = os.path.getsize(pte_path)
+            with Client((args.ip, args.port)) as conn:
+                conn.send(
+                    json.dumps(
+                        {
+                            "pte_size": pte_size,
+                        }
+                    )
+                )
         exit(f"Finish compile_only and save to {args.artifact}")
 
     try:
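
Reporting pte_size back over the ip/port channel is what lets the test harness enforce its size budget without a device. A rough shell equivalent of that check against an already-compiled artifact (the .pte path below is a hypothetical placeholder; llama.py derives the real filename from pte_filename):

PTE="./stories110m_artifacts/hybrid_llama_qnn.pte"  # hypothetical path and name
SIZE=$(stat -c%s "$PTE")                            # file size in bytes (GNU stat)
[ "$SIZE" -le 130000000 ] || echo "pte too large: $SIZE bytes"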
