pytorch
diff --git a/‎.ci/scripts/setup-windows.ps1‎
Lines changed: 24 additions & 0 deletions b/‎.ci/scripts/setup-windows.ps1‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎.ci/scripts/test_model.ps1‎
Lines changed: 89 additions & 0 deletions b/‎.ci/scripts/test_model.ps1‎
Lines changed: 89 additions & 0 deletions
diff --git a/‎.ci/scripts/unittest-windows.ps1‎
Lines changed: 15 additions & 0 deletions b/‎.ci/scripts/unittest-windows.ps1‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎.github/workflows/_unittest.yml‎
Lines changed: 21 additions & 0 deletions b/‎.github/workflows/_unittest.yml‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎.github/workflows/android-perf.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/android-perf.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 24 additions & 0 deletions b/‎.github/workflows/trunk.yml‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 0 deletions b/‎.gitignore‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/apple/coreml/compiler/torch_ops.py‎
Lines changed: 16 additions & 5 deletions b/‎backends/apple/coreml/compiler/torch_ops.py‎
Lines changed: 16 additions & 5 deletions
diff --git a/‎backends/apple/coreml/test/test_torch_ops.py‎
Lines changed: 62 additions & 5 deletions b/‎backends/apple/coreml/test/test_torch_ops.py‎
Lines changed: 62 additions & 5 deletions
diff --git a/‎backends/arm/arm_backend.py‎
Lines changed: 22 additions & 0 deletions b/‎backends/arm/arm_backend.py‎
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Prepares a Windows CI machine for ExecuTorch jobs: creates and activates the
+# "et" conda environment, loads the MSVC developer shell, installs the CI test
+# requirements and then runs install_executorch.bat.
+#
+# Parameters:
+#   -editable  Pass 'true' to run install_executorch.bat with --editable; any
+#              other value performs a regular install.
+param (
+    [string]$editable = 'false'
+)
+
+conda create --yes --quiet -n et python=3.12
+conda activate et
+
+# Activate the VS environment - this is required for Dynamo to work, as it uses MSVC.
+# There are a bunch of environment variables that it requires.
+# See https://learn.microsoft.com/en-us/cpp/build/building-on-the-command-line.
+& "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64
+
+# Install test dependencies
+pip install -r .ci/docker/requirements-ci.txt
+# Fail here rather than letting a broken dependency install surface later as
+# an unrelated-looking install or test failure.
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "Test dependency installation was unsuccessful. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
+
+if ($editable -eq 'true') {
+    install_executorch.bat --editable
+} else {
+    install_executorch.bat
+}
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "Installation was unsuccessful. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
@@ -0,0 +1,89 @@
+# Builds executor_runner, exports the requested example model for the selected
+# backend, and runs the exported .pte file through the runner.
+#
+# Parameters:
+#   -modelName  Name of the example model to export.
+#   -backend    One of "portable", "xnnpack-f32" or "xnnpack-q8".
+#   -buildDir   CMake build directory; deleted and recreated on every run.
+#   -strict     When true, --strict is added to the portable export.
+param (
+    [string]$modelName,
+    [string]$backend,
+    [string]$buildDir = "cmake-out",
+    [bool]$strict = $false
+)
+
+# Trace every executed line and turn errors into terminating errors.
+Set-PSDebug -Trace 1
+$ErrorActionPreference = 'Stop'
+# Apply the same preference to native commands that exit with a non-zero code
+# (only honored by PowerShell versions that support this preference variable).
+$PSNativeCommandUseErrorActionPreference = $true
+
+# Exports a model with the portable example export script.
+#
+# Parameters:
+#   -model_name  Name of the example model to export.
+#   -strict      When true, --strict is appended to the export arguments.
+#
+# Output: the name of the exported file, "<model_name>.pte". The script exits
+# with the exporter's exit code if the export fails.
+function ExportModel-Portable {
+    param (
+        [string]$model_name,
+        [bool]$strict
+    )
+
+    # Use the function's own parameter instead of the script-level $modelName,
+    # so the function exports whatever model the caller asked for.
+    $exportParams = "--model_name", "$model_name"
+    if ($strict) {
+        $exportParams += "--strict"
+    }
+    # Exporter output is routed to the host so that only the file name below
+    # is returned to the caller.
+    python -m examples.portable.scripts.export @exportParams | Write-Host
+    if ($LASTEXITCODE -ne 0) {
+        Write-Host "Model export failed. Exit code: $LASTEXITCODE."
+        exit $LASTEXITCODE
+    }
+
+    "$($model_name).pte"
+}
+
+# Exports a model through the XNNPACK ahead-of-time compiler with delegation.
+#
+# Parameters:
+#   -model_name  Name of the example model to export.
+#   -quantize    When true, --quantize is passed and the q8 file name is used;
+#                otherwise the fp32 file name is used.
+#
+# Output: the name of the exported file, "<model_name>_xnnpack_q8.pte" or
+# "<model_name>_xnnpack_fp32.pte". The script exits with the compiler's exit
+# code if the export fails.
+function ExportModel-Xnnpack {
+    param (
+        [string]$model_name,
+        [bool]$quantize
+    )
+
+    # Build the argument list once; only the --quantize flag and the expected
+    # output file name differ between the two variants.
+    $exportParams = "--model_name=$model_name", "--delegate"
+    if ($quantize) {
+        $exportParams += "--quantize"
+        $modelFile = "$($model_name)_xnnpack_q8.pte"
+    } else {
+        $modelFile = "$($model_name)_xnnpack_fp32.pte"
+    }
+    # Compiler output is routed to the host so that only the file name below
+    # is returned to the caller.
+    python -m examples.xnnpack.aot_compiler @exportParams | Write-Host
+    if ($LASTEXITCODE -ne 0) {
+        Write-Host "Model export failed. Exit code: $LASTEXITCODE."
+        exit $LASTEXITCODE
+    }
+
+    $modelFile
+}
+
+# Build the runner
+# Start from an empty build directory so stale artifacts cannot be picked up.
+if (Test-Path -Path $buildDir) {
+    Remove-Item -Path $buildDir -Recurse -Force
+}
+New-Item -Path $buildDir -ItemType Directory
+Push-Location $buildDir
+cmake .. --preset windows
+# Report a configure failure as such instead of letting it surface as a
+# build failure in the next step.
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "CMake configuration failed. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
+cmake --build . -t executor_runner -j16 --config Release
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "Runner build failed. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
+# Join-Path only combines the two path segments; with a relative $buildDir the
+# result is relative to the directory that Pop-Location restores below.
+$executorBinaryPath = Join-Path -Path $buildDir -ChildPath "Release\executor_runner.exe"
+Pop-Location
+
+# Export the model
+# Pick the exporter that matches the requested backend; each helper outputs
+# the name of the .pte file it is expected to have produced.
+switch ($backend) {
+    "portable" {
+        $model_path = ExportModel-Portable -model_name $modelName -strict $strict
+    }
+    "xnnpack-f32" {
+        $model_path = ExportModel-Xnnpack -model_name $modelName -quantize $false
+    }
+    "xnnpack-q8" {
+        $model_path = ExportModel-Xnnpack -model_name $modelName -quantize $true
+    }
+    default {
+        # Any other backend name is rejected with exit code 1.
+        Write-Host "Unknown backend $backend."
+        exit 1
+    }
+}
+
+# Run the runner
+# Executes the exported model with the runner binary and propagates a
+# non-zero exit code from it.
+& "$executorBinaryPath" --model_path="$model_path"
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "Model execution failed. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
@@ -0,0 +1,15 @@
+# Runs the Python unit tests on Windows using the pytest-windows.ini configuration.
+#
+# Parameters:
+#   -editable  Accepted from the caller but not read anywhere in this script.
+param (
+    [string]$editable = $false
+)
+
+# Trace every executed line and turn errors into terminating errors.
+Set-PSDebug -Trace 1
+$ErrorActionPreference = 'Stop'
+# Apply the same preference to native commands that exit with a non-zero code
+# (only honored by PowerShell versions that support this preference variable).
+$PSNativeCommandUseErrorActionPreference = $true
+
+# Run pytest. The coverage-enabled invocation on the next line is disabled;
+# the active command runs verbosely with full tracebacks and no coverage.
+# pytest -n auto --cov=./ --cov-report=xml
+pytest -v --full-trace -c pytest-windows.ini
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "Pytest invocation was unsuccessful. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
@@ -19,6 +19,7 @@ on:
         required: false
         type: string
         description: Install ExecuTorch in editable mode or not.
+        default: 'false'
       python-version:
         required: false
         type: string
@@ -52,3 +53,23 @@ jobs:
         # This is needed to get the prebuilt PyTorch wheel from S3
         ${CONDA_RUN} --no-capture-output pip install awscli==1.37.21
         .ci/scripts/unittest-macos.sh --build-tool "${{ inputs.build-tool }}" --build-mode "${{ inputs.build-mode }}" --editable "${{ inputs.editable }}"
+
+  # Runs the Python unit tests on a Windows runner; only enabled for CMake builds.
+  windows:
+    if: ${{ inputs.build-tool == 'cmake' }}
+    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
+    with:
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      # setup-windows.ps1 is the script that runs install_executorch.bat, so it
+      # must receive the editable flag for the input to have any effect.
+      script: |
+        conda init powershell
+
+        powershell -Command "& {
+          Set-PSDebug -Trace 1
+          \$ErrorActionPreference = 'Stop'
+          \$PSNativeCommandUseErrorActionPreference = \$true
+
+          .ci/scripts/setup-windows.ps1 -editable "${{ inputs.editable }}"
+
+          powershell .ci/scripts/unittest-windows.ps1 -editable "${{ inputs.editable }}"
+        }"
@@ -72,7 +72,7 @@ jobs:
           # Separate default values from the workflow dispatch. To ensure defaults are accessible
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
           CRON_DEFAULT_DEVICES: samsung_galaxy_s22+public
         run: |
           set -eux
 
@@ -979,3 +979,27 @@ jobs:
         # Run MCU models
         chmod +x examples/arm/run_mcu_models_fvp.sh
         examples/arm/run_mcu_models_fvp.sh --target=cortex-m55
+
+  # Exports and runs each listed example model on Windows, once per backend.
+  test-models-windows:
+    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
+    strategy:
+      fail-fast: false
+      matrix:
+        model:
+          - linear
+          - add
+          - add_mul
+          - ic3
+          - ic4
+          - mv2
+          - mv3
+          - resnet18
+          - resnet50
+          - vit
+          - w2l
+          - mobilebert
+          - emformer_join
+          - emformer_transcribe
+        backend:
+          - portable
+          - xnnpack-f32
+          - xnnpack-q8
+    with:
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 60
+      script: |
+        conda init powershell
+        
+        powershell -Command "& {
+          Set-PSDebug -Trace 1
+          \$ErrorActionPreference = 'Stop'
+          \$PSNativeCommandUseErrorActionPreference = \$true
+
+          .ci/scripts/setup-windows.ps1       
+
+          powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}
+        }"
@@ -65,3 +65,7 @@ xcuserdata/
 
 # Android
 *.aar
+
+# Windows
+*.dll
+*.pyd
@@ -198,11 +198,22 @@ def dequantize_codebook(context, node):
 
     # Assert codebook is as expected.  codebook.dim() = codes.dim() + 2
     assert len(codebook.shape) == 4, "Only rank 4 inputs are supported for codebook"
-    assert codebook.shape[0] == 1, "Only grouped_channel granularity is supported"
-    n_luts = codebook.shape[1]
-    assert (
-        codes.shape[1] % n_luts == 0
-    ), "codes.shape[1] must be divisible by codebook.shape[1]"
+    assert (codebook.shape[0] == 1) or (
+        codebook.shape[1] == 1
+    ), "Only grouped_channel granularity is supported"
+    if codebook.shape[0] == 1:
+        # LUT is per column group
+        n_luts = codebook.shape[1]
+        assert (
+            codes.shape[1] % n_luts == 0
+        ), "codes.shape[1] must be divisible by codebook.shape[1]"
+    else:
+        # LUT is per row group
+        n_luts = codebook.shape[0]
+        assert (
+            codes.shape[0] % n_luts == 0
+        ), "codes.shape[0] must be divisible by codebook.shape[0]"
+
     assert codebook.shape[2] == 2**nbits
     assert codebook.shape[3] == 1, "Only scalar look up values are supported"
 
 
@@ -35,7 +35,7 @@ def _coreml_partitioner(self):
 
     def _get_test_model(self):
+        """Return an Embedding -> Linear -> ReLU model and a one-index example input tuple."""
         model = torch.nn.Sequential(
-            torch.nn.Embedding(64, 128), torch.nn.Linear(128, 128), torch.nn.ReLU()
+            torch.nn.Embedding(64, 128), torch.nn.Linear(128, 256), torch.nn.ReLU()
         )
         example_inputs = (torch.LongTensor([0]),)
         return model, example_inputs
@@ -158,7 +158,7 @@ def test_dequantize_affine_c8w_embedding_b4w_linear(self):
         et_prog = delegated_program.to_executorch()
         self._compare_outputs(et_prog, model, example_inputs)
 
-    def test_dequantize_codebook_linear(self):
+    def test_dequantize_codebook_linear_per_grouped_col(self):
         model, example_inputs = self._get_test_model()
         quantize_(
             model,
@@ -185,7 +185,34 @@ def test_dequantize_codebook_linear(self):
         et_prog = delegated_program.to_executorch()
         self._compare_outputs(et_prog, model, example_inputs)
 
-    def test_dequantize_codebook_embedding(self):
+    def test_dequantize_codebook_linear_per_grouped_row(self):
+        model, example_inputs = self._get_test_model()
+        quantize_(
+            model,
+            CodebookWeightOnlyConfig(dtype=torch.uint2, block_size=[16, -1]),
+        )
+        ep = torch.export.export(model, example_inputs)
+        assert "torch.ops.quant.dequantize_codebook.default" in ep.graph_module.code
+        delegated_program = executorch.exir.to_edge_transform_and_lower(
+            ep,
+            partitioner=[self._coreml_partitioner()],
+        )
+        for node in delegated_program.exported_program().graph.nodes:
+            if node.op == "call_function":
+                assert node.target.__name__ in [
+                    "executorch_call_delegate",
+                    "getitem",
+                ], f"Got unexpected node target after delegation: {node.target.__name__}"
+
+        assert (
+            "executorch.exir.dialects.edge._ops.quant.dequantize_codebook.default"
+            in format_delegated_graph(delegated_program.exported_program().graph_module)
+        )
+
+        et_prog = delegated_program.to_executorch()
+        self._compare_outputs(et_prog, model, example_inputs)
+
+    def test_dequantize_codebook_embedding_per_grouped_col(self):
         model, example_inputs = self._get_test_model()
         quantize_(
             model,
@@ -212,7 +239,35 @@ def test_dequantize_codebook_embedding(self):
 
         et_prog = delegated_program.to_executorch()
         self._compare_outputs(et_prog, model, example_inputs)
+    
+    def test_dequantize_codebook_embedding_per_grouped_row(self):
+        model, example_inputs = self._get_test_model()
+        quantize_(
+            model,
+            CodebookWeightOnlyConfig(dtype=torch.uint3, block_size=[16, -1]),
+            lambda m, fqn: isinstance(m, torch.nn.Embedding),
+        )
+        ep = torch.export.export(model, example_inputs)
+        assert "torch.ops.quant.dequantize_codebook.default" in ep.graph_module.code
+        delegated_program = executorch.exir.to_edge_transform_and_lower(
+            ep,
+            partitioner=[self._coreml_partitioner()],
+        )
+        for node in delegated_program.exported_program().graph.nodes:
+            if node.op == "call_function":
+                assert node.target.__name__ in [
+                    "executorch_call_delegate",
+                    "getitem",
+                ], f"Got unexpected node target after delegation: {node.target.__name__}"
+
+        assert (
+            "executorch.exir.dialects.edge._ops.quant.dequantize_codebook.default"
+            in format_delegated_graph(delegated_program.exported_program().graph_module)
+        )
 
+        et_prog = delegated_program.to_executorch()
+        self._compare_outputs(et_prog, model, example_inputs)
+        
     def test__clone_dim_order_contiguous(self):
         class Model(torch.nn.Module):
             def forward(self, x):
@@ -243,6 +298,8 @@ def forward(self, x):
     test_runner.test_dequantize_affine_c4w_embedding()
     test_runner.test_dequantize_affine_c4w_linear()
     test_runner.test_dequantize_affine_c8w_embedding_b4w_linear()
-    test_runner.test_dequantize_codebook_linear()
-    test_runner.test_dequantize_codebook_embedding()
+    test_runner.test_dequantize_codebook_linear_per_grouped_col()
+    test_runner.test_dequantize_codebook_linear_per_grouped_row()
+    test_runner.test_dequantize_codebook_embedding_per_grouped_col()
+    test_runner.test_dequantize_codebook_embedding_per_grouped_row()
     test_runner.test__clone_dim_order_contiguous()
@@ -10,6 +10,7 @@
 # backends. Converts via TOSA as an intermediate form supported by AoT and
 # JIT compiler flows.
 #
+from enum import Enum
 from typing import List, Optional
 
 from executorch.backends.arm.tosa_specification import (  # type: ignore[import-not-found]
@@ -22,12 +23,16 @@
 
 
 class ArmCompileSpecBuilder:
+    class DebugMode(Enum):
+        """Debug modes accepted by dump_debug_info(); JSON is the only member defined here."""
+
+        JSON = 1
+
     def __init__(self):
+        """Create a builder with an empty spec list, no flags and every option unset."""
         self.compile_spec: List[CompileSpec] = []
         self.compiler_flags = []
         self.output_format = None
         self.path_for_intermediates = None
         self.tosa_spec = None
+        # Name of the DebugMode chosen via dump_debug_info(); None until it is called.
+        self.tosa_debug_mode = None
 
     def vgf_compile_spec(
         self,
@@ -163,6 +168,13 @@ def dump_intermediate_artifacts_to(
         self.path_for_intermediates = output_path
         return self
 
+    def dump_debug_info(self, debug_mode: DebugMode) -> "ArmCompileSpecBuilder":
+        """
+        Dump debugging information into the intermediates path
+        """
+        self.tosa_debug_mode = debug_mode.name
+        return self
+
     def build(self) -> List[CompileSpec]:
         """
         Generate a list of compile spec objects from the builder
@@ -188,6 +200,16 @@ def build(self) -> List[CompileSpec]:
                 CompileSpec("debug_artifact_path", self.path_for_intermediates.encode())
             )
 
+        if self.tosa_debug_mode is not None:
+            if not self.path_for_intermediates:
+                raise ValueError(
+                    "dump_debug_info() must be used in conjunction with dump_intermediate_artifacts_to()"
+                )
+
+            self.compile_spec.append(
+                CompileSpec("dump_debug_info", self.tosa_debug_mode.encode())
+            )
+
         return self.compile_spec