
Commit dfcabec

Merge branch 'main' into fix-unsigned-comparison
2 parents: c6fe189 + bd530e8 · commit dfcabec

24 files changed: +557 −232 lines

.github/workflows/pull.yml

Lines changed: 63 additions & 0 deletions
@@ -762,3 +762,66 @@ jobs:
 
         # Test selective build
         PYTHON_EXECUTABLE=python bash examples/wasm/test_build_wasm.sh
+
+  unittest-nxp-neutron:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Build and install Executorch
+        PYTHON_EXECUTABLE=python \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
+        .ci/scripts/setup-linux.sh --build-tool "cmake"
+
+        # Install test requirements
+        pip install -r backends/nxp/requirements-tests.txt
+
+        # Run pytest
+        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh
+
+        # Run aot example:
+        PYTHON_EXECUTABLE=python bash examples/nxp/run_aot_example.sh
+
+
+  nxp-build-test:
+    name: nxp-build-test
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-arm-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Build
+        cmake -DEXECUTORCH_BUILD_NXP_NEUTRON=ON -Bcmake-out .
+        cmake --build cmake-out --target executorch_delegate_neutron --config Release
+
+        # Build check for the neutron backend library
+        lib_neutron="cmake-out/backends/nxp/libexecutorch_delegate_neutron.a"
+        if [ -f $lib_neutron ]; then
+          echo "Neutron backend library built."
+        else
+          echo "Neutron backend library not found!"
+          exit 1
+        fi

.github/workflows/trunk.yml

Lines changed: 0 additions & 62 deletions
@@ -302,36 +302,6 @@ jobs:
           exit 1
         fi
 
-  nxp-build-test:
-    name: nxp-build-test
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    permissions:
-      id-token: write
-      contents: read
-    with:
-      runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-arm-sdk
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Build
-        cmake -DEXECUTORCH_BUILD_NXP_NEUTRON=ON -Bcmake-out .
-        cmake --build cmake-out --target executorch_delegate_neutron --config Release
-
-        # Build check for the neutron backend library
-        lib_neutron="cmake-out/backends/nxp/libexecutorch_delegate_neutron.a"
-        if [ -f $lib_neutron ]; then
-          echo "Neutron backend library built."
-        else
-          echo "Neutron backend library not found!"
-          exit 1
-        fi
-
   test-coreml-delegate:
     name: test-coreml-delegate
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
@@ -771,35 +741,3 @@ jobs:
       build-mode: Release
       build-tool: cmake
       docker-image: executorch-ubuntu-22.04-clang12
-
-  unittest-nxp-neutron:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    permissions:
-      id-token: write
-      contents: read
-    with:
-      runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-clang12
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        set -eux
-
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Build and install Executorch
-        PYTHON_EXECUTABLE=python \
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
-        .ci/scripts/setup-linux.sh --build-tool "cmake"
-
-        # Install test requirements
-        pip install -r backends/nxp/requirements-tests.txt
-
-        # Run pytest
-        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh
-
-        # Run aot example:
-        PYTHON_EXECUTABLE=python bash examples/nxp/run_aot_example.sh

backends/qualcomm/_passes/layout_transform.py

Lines changed: 1 addition & 1 deletion
@@ -101,8 +101,8 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.pow.Tensor_Scalar,
         exir_ops.edge.aten.prelu.default,
         exir_ops.edge.aten.repeat.default,
-        exir_ops.edge.aten.round.default,
         exir_ops.edge.aten.relu.default,
+        exir_ops.edge.aten.round.default,
         exir_ops.edge.aten.sigmoid.default,
         exir_ops.edge.aten.split_with_sizes.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,

backends/qualcomm/quantizer/annotators.py

Lines changed: 4 additions & 2 deletions
@@ -275,7 +275,9 @@ def annotate_masked_fill(node: Node, quantization_config: QuantizationConfig) ->
     )
 
 
-@register_annotator([torch.ops.aten.mul, torch.ops.aten.mul.Tensor])
+@register_annotator(
+    [torch.ops.aten.mul, torch.ops.aten.mul.Tensor, torch.ops.aten.mul_.Tensor]
+)
 def annotate_mul(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_binary(node, quantization_config)
 
@@ -1298,7 +1300,7 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
     )
 
 
-@register_annotator([torch.ops.aten.zeros.default])
+@register_annotator([torch.ops.aten.zeros.default, torch.ops.aten.zeros_like.default])
 def annotate_zeros(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]) or not _is_float_tensor(node):
         return
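
The two widened registrations route the in-place variant torch.ops.aten.mul_.Tensor to annotate_mul and torch.ops.aten.zeros_like.default to annotate_zeros, so those aliases no longer fall through unannotated. For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of an op-keyed annotator registry; it is illustrative only, not the ExecuTorch implementation, and uses plain strings as op keys:

# Minimal sketch of the register_annotator pattern (illustrative only).
from typing import Callable, Dict, List

_ANNOTATORS: Dict[str, Callable] = {}

def register_annotator(ops: List[str]):
    def decorator(fn: Callable) -> Callable:
        for op in ops:
            # Every listed alias resolves to the same annotator function.
            _ANNOTATORS[op] = fn
        return fn
    return decorator

@register_annotator(["aten.mul", "aten.mul.Tensor", "aten.mul_.Tensor"])
def annotate_mul(node, quantization_config):
    pass  # the real annotator calls annotate_binary(node, quantization_config)

assert _ANNOTATORS["aten.mul_.Tensor"] is annotate_mul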

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 7 additions & 4 deletions
@@ -153,7 +153,9 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
     )
 
 
-def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
+def annotate_matmul_16a8w(  # noqa: C901
+    gm: torch.fx.GraphModule, annotate_conv=True
+) -> None:
     """
     This function is specific for matmul op 16a8w.
     For k, we will tag such as the below, and
@@ -317,9 +319,10 @@ def annotate_matmul_input1(node: Node):
                 # The arguments of cat op: (the past kv cache, the new kv cache)
                 node = node.args[0][1]
             elif node.target == torch.ops.aten.conv2d.default:
-                annotate_conv2d(
-                    node, quantization_config=quantization_config_8a4w_per_channel
-                )
+                if annotate_conv:
+                    annotate_conv2d(
+                        node, quantization_config=quantization_config_8a4w_per_channel
+                    )
                 break
             elif node.target in [torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor]:
                 break
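
The new annotate_conv keyword defaults to True, so existing callers keep the 8a4w per-channel conv2d annotation; passing False skips it while the matmul annotation proceeds unchanged. A hedged usage sketch (the import path is inferred from the file location, and gm stands for an exported torch.fx.GraphModule supplied by the caller's pipeline):

# Usage sketch; import path assumed from backends/qualcomm/quantizer/custom_annotation.py.
from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_matmul_16a8w,
)

annotate_matmul_16a8w(gm)                       # default: conv2d nodes get the 8a4w per-channel config
annotate_matmul_16a8w(gm, annotate_conv=False)  # leave conv2d nodes for another annotator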

backends/qualcomm/scripts/build.sh

Lines changed: 8 additions & 0 deletions
@@ -85,6 +85,7 @@ if [ "$BUILD_AARCH64" = true ]; then
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DQNN_SDK_ROOT=$QNN_SDK_ROOT \
     -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI='arm64-v8a' \
@@ -104,6 +105,9 @@ if [ "$BUILD_AARCH64" = true ]; then
     -DANDROID_ABI='arm64-v8a' \
     -DANDROID_PLATFORM=android-30 \
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
+    -DSUPPORT_REGEX_LOOKAHEAD=ON \
+    -DBUILD_TESTING=OFF \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
@@ -134,6 +138,7 @@ if [ "$BUILD_X86_64" = true ]; then
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
     -S $PRJ_ROOT \
     -B $BUILD_ROOT \
@@ -157,6 +162,9 @@ if [ "$BUILD_X86_64" = true ]; then
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
+    -DSUPPORT_REGEX_LOOKAHEAD=ON \
+    -DBUILD_TESTING=OFF \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -B$EXAMPLE_ROOT
 
   cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 61 additions & 2 deletions
@@ -3999,7 +3999,7 @@ def test_llama3_2_1b(self):
             "16a4w",
             "--temperature",
             "0",
-            "--llama_model",
+            "--decoder_model",
             "llama3_2",
             "--model_mode",
             "hybrid",
@@ -4079,7 +4079,7 @@ def test_llama_stories_110m(self):
             "16a4w",
             "--temperature",
             "0",
-            "--llama_model",
+            "--decoder_model",
             "stories110m",
             "--model_mode",
             "hybrid",
@@ -4121,6 +4121,65 @@ def test_llama_stories_110m(self):
         if not self.compile_only and not self.enable_x86_64:
             self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai
 
+    def test_qwen2_5(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+
+        prompt = "My favourite condiment is "
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a8w",
+            "--decoder_model",
+            "qwen2_5",
+            "--model_mode",
+            "hybrid",
+            "--prefill_ar_len",
+            "32",
+            "--max_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        # Accuracy is bad for now. Just check user's prompt is returned.
+        golden_start_with = "My favourite condiment is "
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                model_out = msg["result"][0]
+                self.assertTrue(
+                    model_out.startswith(golden_start_with),
+                    f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                )
+                self.assertGreaterEqual(msg["inference_speed"], 95)  # Lanai
+
 
 class TestExampleOssScript(TestQNN):
     def test_albert(self):
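
The new test_qwen2_5 reuses the harness pattern of the llama/stories tests above: the example script is spawned with subprocess.Popen and reports its results back over a multiprocessing.connection socket that the test listens on. Below is a self-contained sketch of that round-trip, with a stand-in runner thread and an assumed free local port in place of the real llama.py subprocess:

# Stdlib-only sketch of the Listener/Client handshake the test relies on.
import json
import threading
from multiprocessing.connection import Client, Listener

ADDRESS = ("127.0.0.1", 6001)  # assumed free local port

def fake_runner():
    # Stands in for llama.py, which connects back and sends a JSON result.
    with Client(ADDRESS) as conn:
        conn.send(json.dumps({"result": ["My favourite condiment is ..."],
                              "inference_speed": 100.0}))

with Listener(ADDRESS) as listener:
    t = threading.Thread(target=fake_runner)
    t.start()
    with listener.accept() as conn:  # blocks until the runner connects
        msg = json.loads(conn.recv())
    t.join()

assert msg["result"][0].startswith("My favourite condiment is ")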

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 15 additions & 0 deletions
@@ -424,6 +424,12 @@ class ComputeGraph final {
   // Scalar Value Extraction
   //
 
+  bool is_scalar_or_none(const ValueRef idx) const {
+    const Value& value = values_.at(idx);
+    return value.isInt() || value.isDouble() || value.isBool() ||
+        value.isNone();
+  }
+
   template <typename T>
   T extract_scalar(const ValueRef idx) {
     Value& value = values_.at(idx);
@@ -439,6 +445,15 @@ class ComputeGraph final {
     VK_THROW("Cannot extract scalar from Value with type ", value.type());
   }
 
+  template <typename T>
+  T extract_scalar_or(const ValueRef idx, const T default_value) {
+    Value& value = values_.at(idx);
+    if (value.isNone()) {
+      return default_value;
+    }
+    return extract_scalar<T>(idx);
+  }
+
   template <typename T>
   std::optional<T> extract_optional_scalar(const ValueRef idx) {
     if (val_is_none(idx)) {

backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl

Lines changed: 14 additions & 5 deletions
@@ -9,6 +9,7 @@
 #version 450 core
 
 #define PRECISION ${PRECISION}
+#define UBO_PARAMS ${UBO_PARAMS}
 
 #define VEC4_T ${texel_type(DTYPE)}
 #define T ${buffer_scalar_type(DTYPE)}
@@ -22,19 +23,27 @@ layout(std430) buffer;
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
 ${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
 
-$if OP_NAME == "slice":
-  ${layout_declare_ubo(B, "int", "start")}
-  ${layout_declare_ubo(B, "int", "step")}
+$if UBO_PARAMS:
+  $if OP_NAME == "slice":
+    ${layout_declare_ubo(B, "int", "start")}
+    ${layout_declare_ubo(B, "int", "step")}
 
-$if OP_NAME == "select":
-  ${layout_declare_ubo(B, "int", "index")}
+  $if OP_NAME == "select":
+    ${layout_declare_ubo(B, "int", "index")}
 
 layout(push_constant) uniform restrict Block {
   ivec4 in_sizes;
   ivec4 out_strides;
   ivec4 in_strides;
   int out_numel;
   int selected_dim;
+  $if not UBO_PARAMS:
+    $if OP_NAME == "slice":
+      int start;
+      int step;
+
+    $if OP_NAME == "select":
+      int index;
 };
 
 ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
