- If your model shares embedding/unembedding weights (as Llama 1B and Llama 3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, an optional third argument to `-E` controls whether the embedding is quantized with weight zeros. For example, `-E "torchao:4,32,true"` quantizes the embedding to 4 bits with group_size=32 and weight zeros (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` quantizes to 4 bits with group_size=32 using scales only. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. See the sketch after this list.
- To do channelwise quantization, set group_size to 0. This works for both linear and embedding layers.
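
The invocation below is a minimal sketch of these flags, assuming the `export_llama` entry point and placeholder paths; only `--use_shared_embedding` and the `-E`/group_size semantics come from the notes above, so verify the remaining flags against your ExecuTorch version:

```
# Sketch: shared embedding/unembedding, 4-bit, group_size=32, with weight zeros.
# The ",true" suffix is the default and could be omitted.
python -m examples.models.llama.export_llama \
  --checkpoint <path-to-checkpoint.pth> \
  --params <path-to-params.json> \
  --use_shared_embedding \
  -E "torchao:4,32,true"

# Sketch: channelwise embedding quantization by setting group_size to 0.
python -m examples.models.llama.export_llama \
  --checkpoint <path-to-checkpoint.pth> \
  --params <path-to-params.json> \
  --use_shared_embedding \
  -E "torchao:4,0"
```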
Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels.
The first step is to install ExecuTorch (the same as step 3.1 above):
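
The commands below are a minimal sketch, assuming the standard clone-and-install flow from the ExecuTorch repository (the script name may differ by release):

```
git clone https://github.com/pytorch/executorch.git
cd executorch
# Installs the ExecuTorch Python package and its dependencies
./install_executorch.sh
```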
## Summary

Phi-4-mini Instruct (3.8B) is a newly released version of the popular Phi-4 model developed by Microsoft.
## Instructions
Phi-4-mini uses the same example code as Llama; only the checkpoint, model params, and tokenizer differ. Please see the [Llama README page](../llama/README.md) for details.
All commands for exporting and running Llama on various backends should also apply to Phi-4-mini after swapping in the following args:
```
--model phi_4_mini
--params examples/models/phi-4-mini/config.json
--checkpoint <path-to-meta-checkpoint>
```
### Generate the Checkpoint
The original checkpoint can be obtained from HuggingFace:
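
A hedged sketch, assuming the checkpoint lives in the `microsoft/Phi-4-mini-instruct` repo on HuggingFace and that `huggingface-cli` is installed:

```
# Downloads the original Phi-4-mini Instruct checkpoint from HuggingFace
huggingface-cli download microsoft/Phi-4-mini-instruct
```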
Here is a basic example for exporting and running Phi-4-mini; please refer to the [Llama README page](../llama/README.md) for more advanced usage.
Export to XNNPACK with no quantization:
```
# No quantization
# Set these paths to point to the downloaded files
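# (Hedged continuation: the original block is truncated here. The lines below
# are a sketch of the usual Llama export flow with the Phi-4-mini args swapped
# in; the variable names and the exact flag set are assumptions to verify
# against the Llama README.)
CHECKPOINT=<path-to-meta-checkpoint>
PARAMS=examples/models/phi-4-mini/config.json

python -m examples.models.llama.export_llama \
  --model phi_4_mini \
  --checkpoint "${CHECKPOINT}" \
  --params "${PARAMS}" \
  --output_name="phi-4-mini.pte"
```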