Commit e05b531

Merge branch 'main' into op-floor-div

2 parents 0abfba1 + ef21739

168 files changed: +9901 -2417 lines changed
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-44d8d54e38c0258357d4e92e1fefe21e845947a3
+e8f76b4295584c4328e7fd7971c131cb341c7438

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-53a2908a10f414a2f85caa06703a26a40e873869
+e6f766c7d750d40603eee3f66c5915bac606b3ea

.ci/docker/requirements-ci.txt

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ sympy==1.12
 timm==0.6.13
 tomli==2.0.1
 torchsr==1.0.4
-transformers==4.47.1
+transformers==4.56.1
 zstd==1.5.5.1
 pandas>=2.2.2; python_version >= '3.10'
 pytest==7.2.0
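The transformers pin moves from 4.47.1 to 4.56.1. A quick local sanity check of the new pin (a minimal sketch; assumes a plain pip environment):

    pip install transformers==4.56.1
    python -c "import transformers; print(transformers.__version__)"  # expect 4.56.1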

.ci/scripts/test_phi_3_mini.sh

Lines changed: 11 additions & 12 deletions

@@ -36,34 +36,33 @@ cmake_build_phi_3_mini() {
   cmake --build ${BUILD_DIR}/${MODEL_DIR} -j${NPROC} --config ${BUILD_TYPE}
 }
 
-# Download and convert tokenizer.model
+# Download tokenizer.model
 prepare_tokenizer() {
-  echo "Downloading and converting tokenizer.model"
-  wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true"
-  $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
+  echo "Downloading tokenizer.model"
+  wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.model?download=true"
 }
 
 # Export phi-3-mini model to pte
 export_phi_3_mini () {
   echo "Exporting phi-3-mini. This will take a few minutes"
-  $PYTHON_EXECUTABLE -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte
+  optimum-cli export executorch --model microsoft/Phi-3-mini-4k-instruct --task text-generation --recipe xnnpack --output_dir ./
 }
 
 run_and_verify() {
   NOW=$(date +"%H:%M:%S")
   echo "Starting to run phi-3-mini runner at ${NOW}"
-  if [[ ! -f "phi-3-mini.pte" ]]; then
-    echo "Export failed. Abort"
+  if [[ ! -f "model.pte" ]]; then
+    echo "Missing model artifact. Abort"
     exit 1
   fi
-  if [[ ! -f "tokenizer.bin" ]]; then
-    echo "tokenizer.bin is missing."
+  if [[ ! -f "tokenizer.model" ]]; then
+    echo "tokenizer.model is missing."
     exit 1
   fi
 
   ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \
-    --model_path=phi-3-mini.pte \
-    --tokenizer_path=tokenizer.bin \
+    --model_path=model.pte \
+    --tokenizer_path=tokenizer.model \
     --seq_len=60 \
     --temperature=0 \
     --prompt="<|system|>

@@ -92,7 +91,7 @@ What is the capital of France?<|end|>
 cmake_install_executorch_libraries
 cmake_build_phi_3_mini
 
-# Step 2. Export the tokenizer and model
+# Step 2. Export the model
 prepare_tokenizer
 export_phi_3_mini
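The updated flow can be reproduced outside CI along these lines (a sketch; assumes optimum-executorch is installed and the runner binary is already built; commands are taken from the diff above):

    # Download the tokenizer directly; no tokenizer.bin conversion step is needed anymore.
    wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.model?download=true"
    # Export with optimum-cli; this writes model.pte into the current directory.
    optimum-cli export executorch --model microsoft/Phi-3-mini-4k-instruct --task text-generation --recipe xnnpack --output_dir ./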

.ci/scripts/utils.sh

Lines changed: 39 additions & 0 deletions

@@ -44,6 +44,44 @@ install_pip_dependencies() {
   popd || return
 }
 
+dedupe_macos_loader_path_rpaths() {
+  if [[ "$(uname)" != "Darwin" ]]; then
+    return
+  fi
+
+  local torch_lib_dir
+  pushd ..
+  torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
+  popd
+
+  if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
+    return
+  fi
+
+  local torch_libs=(
+    "libtorch_cpu.dylib"
+    "libtorch.dylib"
+    "libc10.dylib"
+  )
+
+  for lib_name in "${torch_libs[@]}"; do
+    local lib_path="${torch_lib_dir}/${lib_name}"
+    if [[ ! -f "${lib_path}" ]]; then
+      continue
+    fi
+
+    local removed=0
+    # Repeatedly remove the @loader_path rpath entries until none remain.
+    while install_name_tool -delete_rpath @loader_path "${lib_path}" 2>/dev/null; do
+      removed=1
+    done
+
+    if [[ "${removed}" == "1" ]]; then
+      install_name_tool -add_rpath @loader_path "${lib_path}" || true
+    fi
+  done
+}
+
 install_domains() {
   echo "Install torchvision and torchaudio"
   pip install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}"

@@ -101,6 +139,7 @@ install_pytorch_and_domains() {
     echo "Use cached wheel at ${cached_torch_wheel}"
   fi
 
+  dedupe_macos_loader_path_rpaths
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
   export TORCHAUDIO_VERSION
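What the helper corrects can be inspected by hand (a sketch; assumes macOS with Xcode command-line tools, and the <site-packages> path is a placeholder for the actual install location):

    # Show LC_RPATH load commands; duplicate @loader_path entries trigger the linker error.
    otool -l "<site-packages>/torch/lib/libtorch_cpu.dylib" | grep -A2 LC_RPATH
    # Manual equivalent of the helper: drop every @loader_path entry, then re-add one.
    install_name_tool -delete_rpath @loader_path "<site-packages>/torch/lib/libtorch_cpu.dylib"
    install_name_tool -add_rpath @loader_path "<site-packages>/torch/lib/libtorch_cpu.dylib"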

.github/workflows/cuda.yml

Lines changed: 30 additions & 8 deletions

@@ -88,14 +88,26 @@ jobs:
       PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
 
   export-voxtral-cuda-artifact:
-    name: export-voxtral-cuda-artifact
+    name: export-voxtral-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
      id-token: write
      contents: read
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
+            # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+            extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN

@@ -104,7 +116,7 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      upload-artifact: voxtral-cuda-export
+      upload-artifact: ${{ matrix.quant.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux

@@ -122,14 +134,16 @@ jobs:
         pip list
         echo "::endgroup::"
 
-        echo "::group::Export Voxtral"
+        echo "::group::Export Voxtral (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
           --model "mistralai/Voxtral-Mini-3B-2507" \
           --task "multimodal-text-to-text" \
           --recipe "cuda" \
           --dtype bfloat16 \
           --device cuda \
           --max_seq_len 1024 \
+          ${EXTRA_ARGS} \
           --output_dir ./
         python -m executorch.extension.audio.mel_spectrogram \
           --feature_size 128 \

@@ -142,7 +156,7 @@ jobs:
         test -f voxtral_preprocessor.pte
         echo "::endgroup::"
 
-        echo "::group::Store Voxtral Artifacts"
+        echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"

@@ -201,22 +215,30 @@ jobs:
         echo "::endgroup::"
 
   test-voxtral-cuda-e2e:
-    name: test-voxtral-cuda-e2e
+    name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
     needs: export-voxtral-cuda-artifact
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     strategy:
       fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      download-artifact: voxtral-cuda-export
+      download-artifact: ${{ matrix.format.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux

@@ -226,7 +248,7 @@ jobs:
         pip list
         echo "::endgroup::"
 
-        echo "::group::Prepare Voxtral Artifacts"
+        echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
         cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
         cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
         cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .

@@ -255,7 +277,7 @@ jobs:
         cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
         echo "::endgroup::"
 
-        echo "::group::Run Voxtral Runner"
+        echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
         set +e
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
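Outside CI, the quantized variants correspond to optimum-cli invocations like the following (a sketch assembled from the matrix above; the flags are taken verbatim from the tile-packed entry, and a CUDA machine with optimum-executorch installed is assumed):

    optimum-cli export executorch \
      --model "mistralai/Voxtral-Mini-3B-2507" \
      --task "multimodal-text-to-text" \
      --recipe "cuda" \
      --dtype bfloat16 \
      --device cuda \
      --max_seq_len 1024 \
      --qlinear 4w --qlinear_encoder 4w \
      --qlinear_packing_format tile_packed_to_4d \
      --qlinear_encoder_packing_format tile_packed_to_4d \
      --output_dir ./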

.github/workflows/pull.yml

Lines changed: 6 additions & 2 deletions

@@ -351,6 +351,7 @@ jobs:
 
         # reinstall executorch
         bash ./install_executorch.sh --minimal
+        pip list
 
         # run python unittest
         python -m unittest examples.models.moshi.mimi.test_mimi

@@ -631,11 +632,14 @@ jobs:
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
-
+        echo "::group::Setup ExecuTorch"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
-
+        echo "::endgroup::"
+
+        echo "::group::Setup requirements"
         # install phi-3-mini requirements
         bash examples/models/phi-3-mini/install_requirements.sh
+        echo "::endgroup::"
 
         # run e2e (export, tokenizer and runner)
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh Release
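The ::group::/::endgroup:: pairs added here are GitHub Actions workflow commands: everything echoed between them folds into one collapsible section in the job log. The pattern is generic (a minimal sketch; my_command is a hypothetical step):

    echo "::group::My section"
    my_command            # output collapses under "My section" in the log viewer
    echo "::endgroup::"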

.github/workflows/trunk.yml

Lines changed: 1 addition & 0 deletions

@@ -290,6 +290,7 @@ jobs:
           - test_arm_baremetal: test_models_ethos-u85
           - test_arm_baremetal: test_smaller_stories_llama
           - test_arm_baremetal: test_memory_allocation
+          - test_arm_baremetal: test_model_smollm2-135M
       fail-fast: false
     with:
       runner: linux.2xlarge.memory

CMakeLists.txt

Lines changed: 16 additions & 29 deletions

@@ -99,28 +99,6 @@ announce_configured_options(CCACHE_PROGRAM)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-# Setup RPATH. See
-# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
-# Use separate rpaths during build and install phases
-set(CMAKE_SKIP_BUILD_RPATH OFF)
-# Don't use the install-rpath during the build phase
-set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
-# Automatically add all linked folders that are NOT in the build directory to
-# the rpath (per library?)
-#
-# TODO: Doesn't work for us right now because we are not installing .so's into
-# the correct locations. For example we have libcustom_ops_aot_lib.so depending
-# on _portable_lib.so, which was eventually put under
-# <site-packages>/executorch/extension/pybindings/ but this rpath is not
-# automatically added because at build time it seems `portable_lib` is being
-# built under the same directory, so no extra rpath is being added. To properly
-# fix this we need to install `portable_lib` into the correct path.
-set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
-# ------------------------------ OPTIONS -------------------------------------
-# WARNING: Please don't add example specific options in this CMakeLists.txt.
-# Instead please use `find_package(executorch REQUIRED)` in the example
-# directory and add a new executable in the example `CMakeLists.txt`.
-
 if(NOT EXECUTORCH_ENABLE_LOGGING)
   # Avoid pulling in the logging strings, which can be large. Note that this
   # will set the compiler flag for all targets in this directory, and for all

@@ -605,15 +583,23 @@ if(EXECUTORCH_BUILD_CORTEX_M)
   list(APPEND _executorch_backends coretex_m_backend)
 endif()
 
-if(EXECUTORCH_BUILD_CUDA)
-  # Build common AOTI functionality (required for CUDA)
+# Build common AOTI functionality if needed by CUDA or Metal backends
+if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA)
   # Build CUDA-specific AOTI functionality
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
   # Add aoti_cuda to backends - it already depends on aoti_common
   list(APPEND _executorch_backends aoti_cuda)
 endif()
 
+if(EXECUTORCH_BUILD_METAL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/metal)
+  list(APPEND _executorch_backends metal_backend)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
 endif()

@@ -901,12 +887,13 @@ if(EXECUTORCH_BUILD_PYBIND)
 
   # Set RPATH to find PyTorch libraries relative to the installation location
   # This goes from executorch/extension/pybindings up to site-packages, then to
-  # torch/lib
+  # torch/lib. Don't do this on APPLE, as it fails with the error described
+  # below.
   if(APPLE)
-    set_target_properties(
-      portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
-                              INSTALL_RPATH "@loader_path/../../../torch/lib"
-    )
+    # Skip setting @loader_path for APPLE, since it causes an error like: ld:
+    # duplicate LC_RPATH '@loader_path' in
+    # '<site-packages>/torch/lib/libtorch_cpu.dylib'
   else()
     set_target_properties(
       portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
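For local experimentation, the new Metal path can be exercised with a configure along these lines (a sketch; EXECUTORCH_BUILD_METAL is the option wired up above, while the build directory name and other defaults are illustrative; assumes a macOS host):

    cmake -S . -B cmake-out -DEXECUTORCH_BUILD_METAL=ON
    cmake --build cmake-out -j"$(sysctl -n hw.ncpu)"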

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions

@@ -24,8 +24,8 @@ For Apple, please refer to the [iOS documentation](docs/source/using-executorch-
 executorch
 ├── <a href="backends">backends</a> - Backend delegate implementations for various hardware targets. Each backend uses partitioner to split the graph into subgraphs that can be executed on specific hardware, quantizer to optimize model precision, and runtime components to execute the graph on target hardware. For details refer to the <a href="docs/source/backend-delegates-integration.md">backend documentation</a> and the <a href="docs/source/using-executorch-export.md">Export and Lowering tutorial</a> for more information.
 │   ├── <a href="backends/apple">apple</a> - Apple-specific backends.
-│   │   ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends-coreml.md">doc</a>.
-│   │   └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends-mps.md">doc</a>.
+│   │   ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends/coreml/coreml-overview.md">doc</a>.
+│   │   └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends/mps/mps-overview.md">doc</a>.
 │   ├── <a href="backends/arm">arm</a> - ARM architecture backends. See <a href="docs/source/backends-arm-ethos-u.md">doc</a>.
 │   ├── <a href="backends/cadence">cadence</a> - Cadence-specific backends. See <a href="docs/source/backends-cadence.md">doc</a>.
 │   ├── <a href="backends/example">example</a> - Example backend implementations.
