
Commit 73aeb21

Merge branch 'main' into jz/cpp-no-op-tokenizer
2 parents: 194c829 + c5dd476

File tree: 18 files changed, +495 / -53 lines


.github/workflows/_link_check.yml

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+on:
+  workflow_call:
+    inputs:
+      ref:
+        type: string
+        required: true
+
+jobs:
+  lint-urls:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-linter
+      submodules: 'none'
+      fetch-depth: 0
+      ref: ${{ inputs.ref }}
+      timeout: 90
+      script: |
+        ./scripts/lint_urls.sh $(
+          [ "${{ github.event_name }}" = "pull_request" ] \
+            && git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
+            || [ "${{ github.event_name }}" = "push" ] \
+            && git diff --name-only ${{ github.event.before }} ${{ github.sha }}
+        )
+
+  lint-xrefs:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-linter
+      submodules: 'none'
+      fetch-depth: 0
+      ref: ${{ inputs.ref }}
+      timeout: 90
+      script: |
+        ./scripts/lint_xrefs.sh $(
+          [ "${{ github.event_name }}" = "pull_request" ] \
+            && git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
+            || [ "${{ github.event_name }}" = "push" ] \
+            && git diff --name-only ${{ github.event.before }} ${{ github.sha }}
+        )
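
In both jobs, the `$( ... )` command substitution feeds the lint script the list of changed files: on `pull_request` events it diffs the PR base against the head SHA, on `push` events the previous tip against the new one, and on any other trigger it expands to nothing, so the script runs with no arguments. Presumably `scripts/lint_urls.sh` and `scripts/lint_xrefs.sh` treat an empty argument list as a full-repository scan; that fallback lives in the scripts themselves, not in this workflow.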

.github/workflows/lint.yml

Lines changed: 2 additions & 21 deletions
@@ -64,29 +64,10 @@ jobs:
 
         exit $RC
 
-  lint-urls:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+  link-check:
+    uses: ./.github/workflows/_link_check.yml
     with:
-      runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-linter
-      submodules: 'none'
-      fetch-depth: 0
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        ./scripts/lint_urls.sh
-
-  lint-xrefs:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-linter
-      submodules: 'none'
-      fetch-depth: 0
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        ./scripts/lint_xrefs.sh
 
   android-java-format:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

.github/workflows/nightly.yml

Lines changed: 6 additions & 0 deletions
@@ -30,3 +30,9 @@ jobs:
       test-infra-ref: main
       updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }}
       pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }}
+
+  link-check:
+    needs: update-pytorch-commit-hash
+    uses: ./.github/workflows/_link_check.yml
+    with:
+      ref: ${{ github.sha }}

README.md

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ To get started you can:
 
 - Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device
 - Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away
-- Jump straight into LLM use cases by following specific instructions for [Llama](examples/models/llama/README.md) and [Llava](examples/models/llava/README.md)
+- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), and [Llava](examples/models/llava/README.md)
 
 ## Feedback and Engagement

backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm

Lines changed: 76 additions & 0 deletions
@@ -15,6 +15,9 @@
 #import <XCTest/XCTest.h>
 #import <executorch/runtime/platform/runtime.h>
 #import <model_logging_options.h>
+#import <multiarray.h>
+
+using namespace executorchcoreml;
 
 @interface ETCoreMLModelManagerTests : XCTestCase
 
@@ -148,4 +151,77 @@ - (void)testMulModelExecution {
 }
 }
 
+// See https://github.com/pytorch/executorch/pull/10465
+- (void)testAutoreleasepoolError {
+    NSURL *modelURL = [self.class bundledResourceWithName:@"add_coreml_all" extension:@"bin"];
+    NSError *localError = nil;
+    XCTAssertNotNil(modelURL);
+
+    NSData *modelData = [NSData dataWithContentsOfURL:modelURL];
+    MLModelConfiguration *configuration = [[MLModelConfiguration alloc] init];
+    configuration.computeUnits = MLComputeUnitsAll;
+    ModelHandle *modelHandle = [self.modelManager loadModelFromAOTData:modelData
+                                                         configuration:configuration
+                                                                 error:&localError];
+    XCTAssert(modelHandle);
+
+    ETCoreMLModel *model = [self.modelManager modelWithHandle:modelHandle];
+    XCTAssert(model);
+
+    NSArray<MLMultiArray *> *inputArrays =
+        [ETCoreMLTestUtils inputsForModel:model repeatedValues:@[@(2), @(3)] error:&localError];
+    XCTAssert(inputArrays);
+
+    std::vector<MultiArray> multiArrays;
+    multiArrays.reserve(inputArrays.count + model.orderedOutputNames.count);
+    for (MLMultiArray *array in inputArrays) {
+        auto dataTypeOpt = to_multiarray_data_type(array.dataType);
+        XCTAssert(dataTypeOpt.has_value());
+        auto dataType = dataTypeOpt.value();
+
+        std::vector<size_t> dims;
+        for (NSNumber *n in array.shape) {
+            dims.push_back(n.unsignedLongValue);
+        }
+
+        std::vector<ssize_t> strides(dims.size());
+        ssize_t currentStride = 1;
+        for (NSInteger i = dims.size() - 1; i >= 0; --i) {
+            strides[i] = currentStride;
+            currentStride *= dims[i];
+        }
+
+        multiArrays.emplace_back(array.dataPointer,
+                                 MultiArray::MemoryLayout(dataType, dims, strides));
+    }
+
+    auto inputLayout = multiArrays[0].layout();
+    size_t bufferSize = inputLayout.num_bytes();
+    for (NSUInteger i = 0; i < model.orderedOutputNames.count; ++i) {
+        multiArrays.emplace_back(calloc(1, bufferSize), inputLayout);
+    }
+    // corrupt first input shape to force error
+    {
+        auto originalLayout = multiArrays[0].layout();
+        auto corruptedDims = originalLayout.shape();
+        corruptedDims[0] += 1;
+        multiArrays[0] = MultiArray(multiArrays[0].data(),
+                                    MultiArray::MemoryLayout(originalLayout.dataType(),
+                                                             corruptedDims,
+                                                             originalLayout.strides()));
+    }
+
+    BOOL success = [self.modelManager executeModelWithHandle:modelHandle
+                                                     argsVec:multiArrays
+                                              loggingOptions:ModelLoggingOptions()
+                                                 eventLogger:nullptr
+                                                       error:&localError];
+    XCTAssertFalse(success);
+    XCTAssertNotNil(localError);
+
+    for (size_t i = inputArrays.count; i < multiArrays.size(); ++i) {
+        free(multiArrays[i].data());
+    }
+}
+
 @end
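
In outline, the new test loads the bundled `add_coreml_all` model, wraps each input `MLMultiArray` in a `MultiArray` view with row-major strides, appends zero-filled output buffers allocated with `calloc`, and then grows the first input's leading dimension by one so that execution must fail. The assertions pin down the behavior this change cares about: `executeModelWithHandle:` returns `NO` and populates `localError` instead of crashing, exercising the error path referenced above in pytorch/executorch#10465.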

docs/source/conf.py

Lines changed: 2 additions & 2 deletions
@@ -18,9 +18,9 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
-import distutils.file_util
 import glob
 import os
+import shutil
 import sys
 from typing import Any
 
@@ -135,7 +135,7 @@
 # Copy .md files from source dir to gallery dir
 for f in glob.glob(os.path.join(source_dir, "*.md")):
 
-    distutils.file_util.copy_file(f, gallery_dir, update=True)
+    shutil.copyfile(f, gallery_dir)
 
 source_suffix = [".rst", ".md"]
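
One behavioral caveat with this swap: `distutils.file_util.copy_file` accepts a directory destination (and `update=True` skipped files that were already up to date), while `shutil.copyfile` requires a destination *file* path and raises an error (e.g. `IsADirectoryError`) when handed a directory. If `gallery_dir` is a directory here, `shutil.copy` is the closer drop-in. A minimal sketch of that variant, with placeholder paths rather than the repository's real values:

```python
import glob
import os
import shutil

source_dir = "docs/source"    # placeholder; conf.py defines the real value
gallery_dir = "docs/gallery"  # placeholder; conf.py defines the real value

for f in glob.glob(os.path.join(source_dir, "*.md")):
    # shutil.copy accepts a directory target and keeps the source filename,
    # matching the old distutils behavior (minus the update=True check).
    shutil.copy(f, gallery_dir)
```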

examples/models/llama/attention.py

Lines changed: 6 additions & 1 deletion
@@ -178,6 +178,7 @@ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
         self.dim = args.dim
         self.attention_qkv_bias = args.attention_qkv_bias
         self.use_qk_norm = args.use_qk_norm
+        self.qk_norm_before_rope = args.qk_norm_before_rope
 
         if self.use_qk_norm:
             q_norm_dim = self.head_dim
@@ -243,14 +244,18 @@ def forward(
         k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
         v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
 
+        if self.use_qk_norm and self.qk_norm_before_rope:
+            q = self.q_norm_fn(q)
+            k = self.k_norm_fn(k)
+
         # RoPE relative positional embeddings
         q, k = self.rope.forward(q, k, freqs_cos, freqs_sin)
 
         q = q.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)
 
-        if self.use_qk_norm:
+        if self.use_qk_norm and not self.qk_norm_before_rope:
             q = self.q_norm_fn(q)
             k = self.k_norm_fn(k)
 
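
To make the ordering concrete, here is a standalone sketch (simplified, not the repository's code) of the two paths the new flag selects; `norm` stands in for the `q_norm_fn`/`k_norm_fn` modules and `apply_rope` for `self.rope.forward`:

```python
def qk_norm_and_rope(q, k, norm, apply_rope, qk_norm_before_rope):
    # q, k arrive as (bs, seqlen, n_heads, head_dim), as in Attention.forward.
    if qk_norm_before_rope:
        q, k = norm(q), norm(k)  # added path: normalize before RoPE
    q, k = apply_rope(q, k)      # RoPE relative positional embeddings
    q = q.transpose(1, 2)        # -> (bs, n_heads, seqlen, head_dim)
    k = k.transpose(1, 2)
    if not qk_norm_before_rope:
        q, k = norm(q), norm(k)  # existing path: normalize after RoPE
    return q, k
```

The two orderings are generally not equivalent (the norm's learned per-channel scale does not commute with the rotation), so the flag has to match the convention the checkpoint was trained with; the new config added later in this commit, evidently for Qwen 3 given the registry entries below, sets it to `true`.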

examples/models/llama/export_llama_lib.py

Lines changed: 10 additions & 0 deletions
@@ -100,6 +100,9 @@
     "llama3_2",
     "static_llama",
     "qwen2_5",
+    "qwen3-0_6b",
+    "qwen3-1_7b",
+    "qwen3-4b",
     "phi_4_mini",
     "smollm2",
 ]
@@ -108,6 +111,9 @@
     "qwen2_5": "Qwen/Qwen2.5-1.5B",
     "phi_4_mini": "microsoft/Phi-4-mini-instruct",
     "smollm2": "HuggingFaceTB/SmolLM-135M",
+    "qwen3-0_6b": "Qwen/Qwen3-0.6B",
+    "qwen3-1_7b": "Qwen/Qwen3-1.7B",
+    "qwen3-4b": "Qwen/Qwen3-4B",
 }
 
 
@@ -544,6 +550,10 @@ def export_llama(args) -> str:
         from executorch.examples.models.qwen2_5 import (  # pyre-ignore[21]
             convert_weights,
         )
+    elif args.model.startswith("qwen3"):
+        from executorch.examples.models.qwen3 import (  # pyre-ignore[21]
+            convert_weights,
+        )
     elif args.model == "phi_4_mini":
         from executorch.examples.models.phi_4_mini import (  # pyre-ignore[21]
             convert_weights,
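
For context, the registry plus `startswith` dispatch amounts to the pattern sketched below; the dictionary entries and the `qwen3` import come from this diff, while `resolve_converter` and its error handling are illustrative only, not the module's actual control flow:

```python
HUGGINGFACE_REPO_IDS = {
    "qwen3-0_6b": "Qwen/Qwen3-0.6B",
    "qwen3-1_7b": "Qwen/Qwen3-1.7B",
    "qwen3-4b": "Qwen/Qwen3-4B",
}

def resolve_converter(model: str):
    # One startswith() branch covers every qwen3 size, so adding a new size
    # needs only a registry entry, not new dispatch code.
    if model.startswith("qwen3"):
        from executorch.examples.models.qwen3 import convert_weights
        return convert_weights
    raise ValueError(f"no converter registered for {model!r}")
```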

examples/models/llama/model_args.py

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ class ModelArgs:
     apply_embedding: bool = True  # Use embedding inside the transformer
     apply_output: bool = True  # Use output layer (unembedding) inside the transformer
     use_qk_norm: bool = False  # apply normalization to q and k in the attention
+    qk_norm_before_rope: bool = False  # when to apply qk norm
     use_hf_rope: bool = False  # Use HuggingFace's RoPE implementation
     partial_rotary_factor: float = 1.0
     rope_theta: Optional[float] = (

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+{
+  "dim": 1024,
+  "ffn_dim_multiplier": 1,
+  "hidden_dim": 3072,
+  "n_heads": 16,
+  "head_dim": 128,
+  "n_kv_heads": 8,
+  "n_layers": 28,
+  "norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "use_scaled_rope": false,
+  "vocab_size": 151936,
+  "use_hf_rope": true,
+  "attention_qkv_bias": false,
+  "use_qk_norm": true,
+  "qk_norm_before_rope": true
+}
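
The keys in this config mirror `ModelArgs` fields one-for-one, including the new `qk_norm_before_rope`, so it can in principle be loaded straight into the dataclass. A minimal sketch, assuming the in-tree import path and a placeholder filename (the actual path of this new file is not shown above):

```python
import json

from executorch.examples.models.llama.model_args import ModelArgs

with open("qwen3_0_6b_config.json") as f:  # placeholder filename
    params = json.load(f)

args = ModelArgs(**params)  # assumes every JSON key is a ModelArgs field
assert args.use_qk_norm and args.qk_norm_before_rope
```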
