Skip to content

Commit 6ffd0ac

Browse files
author
ssjia
committed
Update base for Update on "[ET-VK][ez] Make pipeline executable properties be controlled by a different macro"
Prevent the following validation layer errors when building with VULKAN_DEBUG ``` Validation Error: [ VUID-VkComputePipelineCreateInfo-None-09497 ] | MessageID = 0xde3918a7 vkCreateComputePipelines(): pCreateInfos[0].flags has VkPipelineCreateFlagBits values (VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR|VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) that requires the extensions VK_KHR_pipeline_executable_properties. The Vulkan spec states: If the pNext chain does not include a VkPipelineCreateFlags2CreateInfo structure, flags must be a valid combination of VkPipelineCreateFlagBits values (https://vulkan.lunarg.com/doc/view/1.4.321.0/mac/antora/spec/latest/chapters/pipelines.html#VUID-VkComputePipelineCreateInfo-None-09497) Validation 0 vkCreateComputePipelines(): pCreateInfos[0].flags has VkPipelineCreateFlagBits values (VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR|VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) that requires the extensions VK_KHR_pipeline_executable_properties. The Vulkan spec states: If the pNext chain does not include a VkPipelineCreateFlags2CreateInfo structure, flags must be a valid combination of VkPipelineCreateFlagBits values (https://vulkan.lunarg.com/doc/view/1.4.321.0/mac/antora/spec/latest/chapters/pipelines.html#VUID-VkComputePipelineCreateInfo-None-09497) Validation Error: [ VUID-VkComputePipelineCreateInfo-None-09497 ] | MessageID = 0xde3918a7 vkCreateComputePipelines(): pCreateInfos[0].flags has VkPipelineCreateFlagBits values (VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR|VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) that requires the extensions VK_KHR_pipeline_executable_properties. 
The Vulkan spec states: If the pNext chain does not include a VkPipelineCreateFlags2CreateInfo structure, flags must be a valid combination of VkPipelineCreateFlagBits values (https://vulkan.lunarg.com/doc/view/1.4.321.0/mac/antora/spec/latest/chapters/pipelines.html#VUID-VkComputePipelineCreateInfo-None-09497) Validation 0 vkCreateComputePipelines(): pCreateInfos[0].flags has VkPipelineCreateFlagBits values (VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR|VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) that requires the extensions VK_KHR_pipeline_executable_properties. The Vulkan spec states: If the pNext chain does not include a VkPipelineCreateFlags2CreateInfo structure, flags must be a valid combination of VkPipelineCreateFlagBits values (https://vulkan.lunarg.com/doc/view/1.4.321.0/mac/antora/spec/latest/chapters/pipelines.html#VUID-VkComputePipelineCreateInfo-None-09497) ``` Differential Revision: [D84716453](https://our.internmc.facebook.com/intern/diff/D84716453/) [ghstack-poisoned]
2 parents 28f9017 + baa41c6 commit 6ffd0ac

File tree

85 files changed

+2965
-1170
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

85 files changed

+2965
-1170
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
44d8d54e38c0258357d4e92e1fefe21e845947a3
1+
09fdbd0a0639b128f712a4f5202ed42ca4c60957
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
53a2908a10f414a2f85caa06703a26a40e873869
1+
e6f766c7d750d40603eee3f66c5915bac606b3ea

.ci/scripts/utils.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,44 @@ install_pip_dependencies() {
4444
popd || return
4545
}
4646

47+
dedupe_macos_loader_path_rpaths() {
48+
if [[ "$(uname)" != "Darwin" ]]; then
49+
return
50+
fi
51+
52+
local torch_lib_dir
53+
pushd ..
54+
torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
55+
popd
56+
57+
if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
58+
return
59+
fi
60+
61+
local torch_libs=(
62+
"libtorch_cpu.dylib"
63+
"libtorch.dylib"
64+
"libc10.dylib"
65+
)
66+
67+
for lib_name in "${torch_libs[@]}"; do
68+
local lib_path="${torch_lib_dir}/${lib_name}"
69+
if [[ ! -f "${lib_path}" ]]; then
70+
continue
71+
fi
72+
73+
local removed=0
74+
# Repeatedly remove the @loader_path rpath entries until none remain.
75+
while install_name_tool -delete_rpath @loader_path "${lib_path}" 2>/dev/null; do
76+
removed=1
77+
done
78+
79+
if [[ "${removed}" == "1" ]]; then
80+
install_name_tool -add_rpath @loader_path "${lib_path}" || true
81+
fi
82+
done
83+
}
84+
4785
install_domains() {
4886
echo "Install torchvision and torchaudio"
4987
pip install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}"
@@ -101,6 +139,7 @@ install_pytorch_and_domains() {
101139
echo "Use cached wheel at ${cached_torch_wheel}"
102140
fi
103141

142+
dedupe_macos_loader_path_rpaths
104143
# Grab the pinned audio and vision commits from PyTorch
105144
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
106145
export TORCHAUDIO_VERSION

.github/workflows/cuda.yml

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,26 @@ jobs:
8888
PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
8989
9090
export-voxtral-cuda-artifact:
91-
name: export-voxtral-cuda-artifact
91+
name: export-voxtral-cuda-${{ matrix.quant.name }}
9292
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
9393
permissions:
9494
id-token: write
9595
contents: read
9696
secrets: inherit
9797
strategy:
9898
fail-fast: false
99+
matrix:
100+
quant:
101+
- name: "non-quantized"
102+
artifact: "voxtral-cuda-export"
103+
extra_args: ""
104+
- name: "quantized-int4-tile-packed"
105+
artifact: "voxtral-cuda-quantized-int4-tile-packed"
106+
extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
107+
- name: "quantized-int4-weight-only"
108+
artifact: "voxtral-cuda-quantized-int4-weight-only"
109+
# TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
110+
extra_args: "--qlinear_encoder 4w"
99111
with:
100112
timeout: 90
101113
secrets-env: EXECUTORCH_HF_TOKEN
@@ -104,7 +116,7 @@ jobs:
104116
gpu-arch-version: 12.6
105117
use-custom-docker-registry: false
106118
submodules: recursive
107-
upload-artifact: voxtral-cuda-export
119+
upload-artifact: ${{ matrix.quant.artifact }}
108120
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
109121
script: |
110122
set -eux
@@ -122,14 +134,16 @@ jobs:
122134
pip list
123135
echo "::endgroup::"
124136
125-
echo "::group::Export Voxtral"
137+
echo "::group::Export Voxtral (${{ matrix.quant.name }})"
138+
EXTRA_ARGS="${{ matrix.quant.extra_args }}"
126139
optimum-cli export executorch \
127140
--model "mistralai/Voxtral-Mini-3B-2507" \
128141
--task "multimodal-text-to-text" \
129142
--recipe "cuda" \
130143
--dtype bfloat16 \
131144
--device cuda \
132145
--max_seq_len 1024 \
146+
${EXTRA_ARGS} \
133147
--output_dir ./
134148
python -m executorch.extension.audio.mel_spectrogram \
135149
--feature_size 128 \
@@ -142,7 +156,7 @@ jobs:
142156
test -f voxtral_preprocessor.pte
143157
echo "::endgroup::"
144158
145-
echo "::group::Store Voxtral Artifacts"
159+
echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
146160
mkdir -p "${RUNNER_ARTIFACT_DIR}"
147161
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
148162
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
@@ -201,22 +215,30 @@ jobs:
201215
echo "::endgroup::"
202216
203217
test-voxtral-cuda-e2e:
204-
name: test-voxtral-cuda-e2e
218+
name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
205219
needs: export-voxtral-cuda-artifact
206220
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
207221
permissions:
208222
id-token: write
209223
contents: read
210224
strategy:
211225
fail-fast: false
226+
matrix:
227+
format:
228+
- name: "non-quantized"
229+
artifact: "voxtral-cuda-export"
230+
- name: "quantized-int4-tile-packed"
231+
artifact: "voxtral-cuda-quantized-int4-tile-packed"
232+
- name: "quantized-int4-weight-only"
233+
artifact: "voxtral-cuda-quantized-int4-weight-only"
212234
with:
213235
timeout: 90
214236
runner: linux.g5.4xlarge.nvidia.gpu
215237
gpu-arch-type: cuda
216238
gpu-arch-version: 12.6
217239
use-custom-docker-registry: false
218240
submodules: recursive
219-
download-artifact: voxtral-cuda-export
241+
download-artifact: ${{ matrix.format.artifact }}
220242
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
221243
script: |
222244
set -eux
@@ -226,7 +248,7 @@ jobs:
226248
pip list
227249
echo "::endgroup::"
228250
229-
echo "::group::Prepare Voxtral Artifacts"
251+
echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
230252
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
231253
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
232254
cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
@@ -255,7 +277,7 @@ jobs:
255277
cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
256278
echo "::endgroup::"
257279
258-
echo "::group::Run Voxtral Runner"
280+
echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
259281
set +e
260282
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
261283
OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \

.github/workflows/pull.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,7 @@ jobs:
351351
352352
# reinstall executorch
353353
bash ./install_executorch.sh --minimal
354+
pip list
354355
355356
# run python unittest
356357
python -m unittest examples.models.moshi.mimi.test_mimi

CMakeLists.txt

Lines changed: 6 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -99,28 +99,6 @@ announce_configured_options(CCACHE_PROGRAM)
9999

100100
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
101101

102-
# Setup RPATH. See
103-
# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
104-
# Use separate rpaths during build and install phases
105-
set(CMAKE_SKIP_BUILD_RPATH OFF)
106-
# Don't use the install-rpath during the build phase
107-
set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
108-
# Automatically add all linked folders that are NOT in the build directory to
109-
# the rpath (per library?)
110-
#
111-
# TODO: Doesn't work for us right now because we are not installing .so's into
112-
# the correct locations. For example we have libcustom_ops_aot_lib.so depending
113-
# on _portable_lib.so, which was eventually put under
114-
# <site-packages>/executorch/extension/pybindings/ but this rpath is not
115-
# automatically added because at build time it seems `portable_lib` is being
116-
# built under the same directory, so no extra rpath is being added. To properly
117-
# fix this we need to install `portable_lib` into the correct path.
118-
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
119-
# ------------------------------ OPTIONS -------------------------------------
120-
# WARNING: Please don't add example specific options in this CMakeLists.txt.
121-
# Instead please use `find_package(executorch REQUIRED)` in the example
122-
# directory and add a new executable in the example `CMakeLists.txt`.
123-
124102
if(NOT EXECUTORCH_ENABLE_LOGGING)
125103
# Avoid pulling in the logging strings, which can be large. Note that this
126104
# will set the compiler flag for all targets in this directory, and for all
@@ -909,12 +887,13 @@ if(EXECUTORCH_BUILD_PYBIND)
909887

910888
# Set RPATH to find PyTorch libraries relative to the installation location
911889
# This goes from executorch/extension/pybindings up to site-packages, then to
912-
# torch/lib
890+
# torch/lib. Don't do this to APPLE, as it will error out on the following
891+
# error:
892+
#
913893
if(APPLE)
914-
set_target_properties(
915-
portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
916-
INSTALL_RPATH "@loader_path/../../../torch/lib"
917-
)
894+
# Skip setting @loader_path for APPLE, since it causes error like ld:
895+
# duplicate LC_RPATH '@loader_path' in '<site-packages>/torch/lib/
896+
# libtorch_cpu.dylib'
918897
else()
919898
set_target_properties(
920899
portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ For Apple, please refer to the [iOS documentation](docs/source/using-executorch-
2424
executorch
2525
├── <a href="backends">backends</a> - Backend delegate implementations for various hardware targets. Each backend uses partitioner to split the graph into subgraphs that can be executed on specific hardware, quantizer to optimize model precision, and runtime components to execute the graph on target hardware. For details refer to the <a href="docs/source/backend-delegates-integration.md">backend documentation</a> and the <a href="docs/source/using-executorch-export.md">Export and Lowering tutorial</a> for more information.
2626
│ ├── <a href="backends/apple">apple</a> - Apple-specific backends.
27-
│ │ ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends-coreml.md">doc</a>.
28-
│ │ └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends-mps.md">doc</a>.
27+
│ │ ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends/coreml/coreml-overview.md">doc</a>.
28+
│ │ └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends/mps/mps-overview.md">doc</a>.
2929
│ ├── <a href="backends/arm">arm</a> - ARM architecture backends. See <a href="docs/source/backends-arm-ethos-u.md">doc</a>.
3030
│ ├── <a href="backends/cadence">cadence</a> - Cadence-specific backends. See <a href="docs/source/backends-cadence.md">doc</a>.
3131
│ ├── <a href="backends/example">example</a> - Example backend implementations.

LICENSE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Copyright (c) 2023 Apple Inc.
99
Copyright (c) 2024 MediaTek Inc.
1010
Copyright 2023 NXP
1111
Copyright (c) 2025 Samsung Electronics Co. LTD
12+
Copyright (c) Intel Corporation
1213

1314
Redistribution and use in source and binary forms, with or without modification,
1415
are permitted provided that the following conditions are met:

README-wheel.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ The prebuilt `executorch.runtime` module included in this package provides a way
1212
to run ExecuTorch `.pte` files, with some restrictions:
1313
* Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module
1414
* Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module.
15-
* \[macOS only] [Core ML](docs/source/backends-coreml.md) and [MPS](docs/source/backends-mps.md) backend
15+
* \[macOS only] [Core ML](docs/source/backends/coreml/coreml-overview.md) and [MPS](docs/source/backends/mps/mps-overview.md) backend
1616
are also linked into the prebuilt module.
1717

1818
Please visit the [ExecuTorch website](https://pytorch.org/executorch) for

backends/aoti/aoti_delegate_handle.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
7171
AOTInductorModelContainerHandle container_handle,
7272
size_t* num_constants);
7373

74+
// Update the model container with the constant tensors
75+
using AOTInductorModelUpdateConstantsFromBlobFunc = AOTIRuntimeError (*)(
76+
AOTInductorModelContainerHandle container_handle,
77+
const uint8_t* weight_blob_ptr);
78+
7479
} // extern "C"
7580

7681
// AOTI Delegate Handle structure
@@ -87,6 +92,7 @@ struct AOTIDelegateHandle {
8792
AOTInductorModelContainerGetNumInputsFunc get_num_inputs;
8893
AOTInductorModelContainerGetNumOutputsFunc get_num_outputs;
8994
AOTInductorModelContainerRunFunc run;
95+
AOTInductorModelUpdateConstantsFromBlobFunc update_constants_from_blob;
9096
};
9197

9298
} // namespace aoti

0 commit comments

Comments
 (0)