15 | 15 | from coremltools.converters.mil.frontend.torch.ops import ( |
16 | 16 |     _get_inputs,
17 | 17 |     _get_kwinputs,
| 18 | +    noop,
18 | 19 |     NUM_TO_NUMPY_DTYPE,
19 | 20 |     NUM_TO_TORCH_DTYPE,
20 | 21 |     split,
@@ -91,6 +92,28 @@ def _to_dim_order_copy(context, node): |
91 | 92 |     to(context, node)
92 | 93 |
93 | 94 |
| 95 | +@register_torch_op( |
| 96 | +    torch_alias=[
| 97 | +        "dim_order_ops::_clone_dim_order",
| 98 | +        "dim_order_ops._clone_dim_order",
| 99 | +    ],
| 100 | +    override=False,
| 101 | +)
| 102 | +def _clone_dim_order(context, node): |
| 103 | +    dim_order = _get_kwinputs(context, node, "dim_order", default=[None])[0]
| 104 | +    node.kwinputs.pop("dim_order")
| 105 | +
| 106 | +    # In CoreML, dim_order.val is an ndarray, so convert it to a list to check the memory format.
| 107 | +    dim_order = [int(d) for d in dim_order.val]
| 108 | +    memory_format = get_memory_format(dim_order)
| 109 | +    assert (
| 110 | +        memory_format == _torch.contiguous_format
| 111 | +    ), "Only contiguous memory format is supported in CoreML"
| 112 | +
| 113 | +    # Since CoreML supports only the contiguous memory format, there is no dim_order to preserve; treat this as a no-op clone.
| 114 | +    noop(context, node)
| 115 | +
| 116 | + |
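For context, `get_memory_format` (imported elsewhere in this file) maps a dim_order list back to a torch memory format. A minimal sketch of that mapping, assuming only the contiguous and rank-4 channels-last layouts are distinguished (illustrative, not the actual implementation):

```python
import torch

def get_memory_format(dim_order):
    # Contiguous tensors store dims in ascending order: [0, 1, ..., rank-1].
    if dim_order == list(range(len(dim_order))):
        return torch.contiguous_format
    # Rank-4 channels-last (NHWC) stores dims in the order [0, 2, 3, 1].
    if dim_order == [0, 2, 3, 1]:
        return torch.channels_last
    raise ValueError(f"Unsupported dim_order: {dim_order}")
```

Under that assumption, `_clone_dim_order` above accepts `dim_order=[0, 1, 2, 3]` and rejects the channels-last order `[0, 2, 3, 1]`.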
94 | 117 | # https://github.com/apple/coremltools/pull/2558 |
95 | 118 | @register_torch_op( |
96 | 119 |     torch_alias=["torchao::dequantize_affine", "torchao.dequantize_affine"],
@@ -175,11 +198,22 @@ def dequantize_codebook(context, node): |
175 | 198 |
176 | 199 |     # Assert codebook is as expected. codebook.dim() = codes.dim() + 2
177 | 200 |     assert len(codebook.shape) == 4, "Only rank 4 inputs are supported for codebook"
178 | | -    assert codebook.shape[0] == 1, "Only grouped_channel granularity is supported"
179 | | -    n_luts = codebook.shape[1]
180 | | -    assert (
181 | | -        codes.shape[1] % n_luts == 0
182 | | -    ), "codes.shape[1] must be divisible by codebook.shape[1]"
| 201 | +    assert (codebook.shape[0] == 1) or (
| 202 | +        codebook.shape[1] == 1
| 203 | +    ), "Only grouped_channel granularity is supported"
| 204 | +    if codebook.shape[0] == 1:
| 205 | +        # LUT is per column group
| 206 | +        n_luts = codebook.shape[1]
| 207 | +        assert (
| 208 | +            codes.shape[1] % n_luts == 0
| 209 | +        ), "codes.shape[1] must be divisible by codebook.shape[1]"
| 210 | +    else:
| 211 | +        # LUT is per row group
| 212 | +        n_luts = codebook.shape[0]
| 213 | +        assert (
| 214 | +            codes.shape[0] % n_luts == 0
| 215 | +        ), "codes.shape[0] must be divisible by codebook.shape[0]"
| 216 | +
183 | 217 |     assert codebook.shape[2] == 2**nbits
184 | 218 |     assert codebook.shape[3] == 1, "Only scalar look up values are supported"
185 | 219 |
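To make the new grouping arithmetic concrete, here is a worked example with hypothetical shapes (all values below are illustrative; with `nbits = 4`, each LUT has 2**4 = 16 entries):

```python
import numpy as np

nbits = 4
codes = np.zeros((8, 12), dtype=np.int8)  # hypothetical rank-2 quantized codes

# Per-column grouping: codebook.shape[0] == 1, LUT count on axis 1.
codebook = np.zeros((1, 4, 2**nbits, 1))
assert codes.shape[1] % codebook.shape[1] == 0  # 12 % 4 == 0, four column groups

# Per-row grouping: codebook.shape[1] == 1, LUT count on axis 0.
codebook = np.zeros((4, 1, 2**nbits, 1))
assert codes.shape[0] % codebook.shape[0] == 0  # 8 % 4 == 0, four row groups
```

Either way, `codebook.shape[2] == 2**nbits` and `codebook.shape[3] == 1` still hold, matching the unchanged asserts above.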