Skip to content

Commit 2e26e77

Browse files
authored
Merge branch 'main' into Arm-backend-ArmTester-support-testing-with-portable-ops
2 parents 2f528f5 + 4675292 commit 2e26e77

File tree

240 files changed

+2815
-2082
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

240 files changed

+2815
-2082
lines changed

.ci/scripts/setup-windows-msvc.ps1

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
conda create --yes --quiet -n et python=3.12
2+
conda activate et
3+
4+
# Install cmake
5+
conda install -y cmake
6+
7+
# Activate the VS environment - this is required for MSVC to work
8+
# There are a bunch of environment variables that it requires.
9+
# See https://learn.microsoft.com/en-us/cpp/build/building-on-the-command-line.
10+
& "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64
11+
12+
# Install CI requirements
13+
pip install -r .ci/docker/requirements-ci.txt
14+
15+
# Create build directory
16+
$buildDir = "cmake-out-msvc"
17+
if (Test-Path -Path $buildDir) {
18+
Remove-Item -Path $buildDir -Recurse -Force
19+
}
20+
New-Item -Path $buildDir -ItemType Directory
21+
22+
# Configure CMake with MSVC (not ClangCL) and disable custom/quantized ops
23+
cmake -S . -B $buildDir `
24+
-DCMAKE_BUILD_TYPE=Release `
25+
-DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON `
26+
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON `
27+
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON `
28+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON `
29+
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON `
30+
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON `
31+
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON `
32+
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF `
33+
-DEXECUTORCH_BUILD_KERNELS_CUSTOM_AOT=OFF `
34+
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=OFF `
35+
-DEXECUTORCH_BUILD_XNNPACK=ON `
36+
-DEXECUTORCH_BUILD_EXTENSION_LLM=ON `
37+
-DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON
38+
39+
if ($LASTEXITCODE -ne 0) {
40+
Write-Host "CMake configuration failed. Exit code: $LASTEXITCODE."
41+
exit $LASTEXITCODE
42+
}
43+
44+
# Build with MSVC
45+
cmake --build $buildDir --config Release -j16
46+
47+
if ($LASTEXITCODE -ne 0) {
48+
Write-Host "Build failed. Exit code: $LASTEXITCODE."
49+
exit $LASTEXITCODE
50+
}
51+
52+
Write-Host "MSVC build completed successfully!"

.github/workflows/cuda.yml

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ jobs:
8989
9090
export-voxtral-cuda-artifact:
9191
name: export-voxtral-cuda-${{ matrix.quant.name }}
92+
# Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
93+
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
9294
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
9395
permissions:
9496
id-token: write
@@ -166,6 +168,8 @@ jobs:
166168
167169
export-gemma3-cuda-artifact:
168170
name: export-gemma3-cuda-${{ matrix.quant.name }}
171+
# Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
172+
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
169173
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
170174
permissions:
171175
id-token: write
@@ -176,12 +180,12 @@ jobs:
176180
matrix:
177181
quant:
178182
- name: "non-quantized"
179-
artifact: "voxtral-cuda-export"
183+
artifact: "gemma3-cuda-export"
180184
extra_args: ""
181-
# TODO: enable gemma3 quantization
182-
# - name: "quantized-int4-tile-packed"
183-
# artifact: "voxtral-cuda-quantized-int4-tile-packed"
184-
# extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
185+
- name: "quantized-int4-tile-packed"
186+
artifact: "gemma3-cuda-quantized-int4-tile-packed"
187+
extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
188+
# TODO: enable int4-weight-only on gemma3.
185189
# - name: "quantized-int4-weight-only"
186190
# artifact: "voxtral-cuda-quantized-int4-weight-only"
187191
# # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
@@ -194,7 +198,7 @@ jobs:
194198
gpu-arch-version: 12.6
195199
use-custom-docker-registry: false
196200
submodules: recursive
197-
upload-artifact: gemma3-cuda-export
201+
upload-artifact: ${{ matrix.quant.artifact }}
198202
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
199203
script: |
200204
set -eux
@@ -255,7 +259,7 @@ jobs:
255259
set -eux
256260
257261
echo "::group::Setup ExecuTorch Requirements"
258-
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
262+
./install_requirements.sh
259263
pip list
260264
echo "::endgroup::"
261265
@@ -305,7 +309,7 @@ jobs:
305309
set -eux
306310
307311
echo "::group::Setup ExecuTorch Requirements"
308-
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
312+
./install_requirements.sh
309313
pip list
310314
echo "::endgroup::"
311315
@@ -363,7 +367,7 @@ jobs:
363367
set -eux
364368
365369
echo "::group::Setup ExecuTorch Requirements"
366-
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
370+
./install_requirements.sh
367371
pip list
368372
echo "::endgroup::"
369373
@@ -435,9 +439,9 @@ jobs:
435439
format:
436440
- name: "non-quantized"
437441
artifact: "gemma3-cuda-export"
438-
# TODO: enable quantized gemma3.
439-
# - name: "quantized-int4-tile-packed"
440-
# artifact: "gemma3-cuda-quantized-int4-tile-packed"
442+
- name: "quantized-int4-tile-packed"
443+
artifact: "gemma3-cuda-quantized-int4-tile-packed"
444+
# TODO: enable int4-weight-only on gemma3.
441445
# - name: "quantized-int4-weight-only"
442446
# artifact: "gemma3-cuda-quantized-int4-weight-only"
443447
with:

.github/workflows/windows-msvc.yml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
name: Windows MSVC Build
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
- release/*
8+
tags:
9+
- ciflow/trunk/*
10+
pull_request:
11+
paths:
12+
- .ci/docker/ci_commit_pins/pytorch.txt
13+
- .ci/scripts/**
14+
workflow_dispatch:
15+
16+
concurrency:
17+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
18+
cancel-in-progress: true
19+
20+
jobs:
21+
build-windows-msvc:
22+
name: build-windows-msvc
23+
uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
24+
with:
25+
submodules: 'recursive'
26+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
27+
timeout: 60
28+
script: |
29+
conda init powershell
30+
powershell -Command "& {
31+
Set-PSDebug -Trace 1
32+
\$ErrorActionPreference = 'Stop'
33+
\$PSNativeCommandUseErrorActionPreference = \$true
34+
.ci/scripts/setup-windows-msvc.ps1
35+
}"

.mypy.ini

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,12 @@ ignore_missing_imports = True
8383
[mypy-tosa_tools.*]
8484
ignore_missing_imports = True
8585

86+
[mypy-tosa_serializer]
87+
ignore_missing_imports = True
88+
89+
[mypy-tosa_serializer.*]
90+
ignore_missing_imports = True
91+
8692
[mypy-setuptools.*]
8793
ignore_missing_imports = True
8894

backends/apple/metal/metal_backend.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929

3030
# exist fallback operators in et namespace;
3131
supported_fallback_kernels: Dict[str, Any] = {
32-
"aoti_torch_mps_addmm_out": None,
3332
"aoti_torch_mps_convolution": None,
3433
"aoti_torch_mps_mm_out": None,
3534
"at::_ops::_scaled_dot_product_attention_math_for_mps::call": None,
@@ -108,34 +107,62 @@ def preprocess(
108107
options: dict[str, typing.Any] = {
109108
# Do not link against the full PyTorch/libtorch library
110109
"aot_inductor.link_libtorch": False,
111-
# Package model constants and other generated files directly in the shared object (.so) file
112-
"aot_inductor.package_constants_in_so": True,
110+
# Separate weight constants from the .so file
111+
"aot_inductor.package": True,
112+
"aot_inductor.package_constants_in_so": False,
113+
# Store weight constants on disk in a binary blob
114+
"aot_inductor.package_constants_on_disk_format": "binary_blob",
113115
# Enable maximum automatic tuning for optimal performance
114116
"max_autotune": True,
115117
# "aot_inductor.debug_compile": True,
116118
# "aot_inductor.force_mmap_weights": False,
117119
}
118120

119121
with collect_unsupported_fallback_kernels():
120-
so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type]
122+
paths = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type]
121123
if len(missing_fallback_kernels) > 0:
122124
formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels))
123125
raise RuntimeError(
124126
f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n"
125127
"Please add them to the AOTI backend."
126128
)
127129

130+
# Extract the .so and .blob paths from the returned list
131+
so_path = None
132+
blob_path = None
133+
for path in paths:
134+
if path.endswith(".wrapper.so"):
135+
so_path = path
136+
elif path.endswith(".wrapper_weights.blob"):
137+
blob_path = path
138+
139+
if so_path is None or blob_path is None:
140+
raise RuntimeError(
141+
f"Could not find required files in compiled paths, got {paths}"
142+
)
143+
128144
# pyre-ignorep[6]: Incompatible parameter type
129145
with open(so_path, "rb") as f:
130146
so_data = f.read()
131147

132148
named_data_store = NamedDataStore()
133149
method_name = MetalBackend.method_name_from_compile_specs(compile_specs)
150+
151+
# Keep the so file in the NamedDataStore, so that it can be packaged into the .pte file.
152+
named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None)
153+
154+
# Add weights blob to named data store
155+
with open(blob_path, "rb") as f:
156+
blob_data = f.read()
157+
134158
named_data_store.add_named_data(
135-
method_name + "_so_blob", so_data, 1, "aoti_metal_blob"
159+
method_name + "_weights_blob", blob_data, 1, "aoti_metal_blob"
136160
)
137161

138-
# Clean up the generated so file; it has been packaged into the NamdeDataStore
162+
# Clean up the weights blob file
163+
os.remove(blob_path)
164+
165+
# Clean up the generated so file; it has been packaged into the NamedDataStore
139166
# pyre-ignorep[6]: Incompatible parameter type
140167
os.remove(so_path)
141168

backends/apple/metal/runtime/metal_backend.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,15 @@ class ET_EXPERIMENTAL MetalBackend final
106106
Debug,
107107
"MetalBackend::load_function_pointers_into_handle - Loaded AOTInductorModelContainerRun");
108108

109+
LOAD_SYMBOL(
110+
handle,
111+
update_constants_from_blob,
112+
AOTInductorModelUpdateConstantsFromBlob,
113+
so_handle);
114+
ET_LOG(
115+
Debug,
116+
"MetalBackend::load_function_pointers_into_handle - Loaded AOTInductorModelUpdateConstantsFromBlob");
117+
109118
ET_LOG(
110119
Debug,
111120
"MetalBackend::load_function_pointers_into_handle - All symbols loaded successfully");
@@ -203,6 +212,9 @@ class ET_EXPERIMENTAL MetalBackend final
203212
outfile.close();
204213
ET_LOG(Info, "MetalBackend::init - File closed successfully");
205214

215+
// Free the buffer immediately after writing to disk
216+
aoti_metal_buffer->Free();
217+
206218
// Load the ELF using dlopen
207219
void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
208220
ET_CHECK_OR_RETURN_ERROR(
@@ -234,6 +246,20 @@ class ET_EXPERIMENTAL MetalBackend final
234246

235247
handle->container_handle = container_handle;
236248

249+
// Look into named data map for constant data
250+
std::string weights_blob_key =
251+
method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
252+
auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
253+
if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
254+
ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
255+
const void* weights_blob = buffer_res->data();
256+
// Feed the weights blob into the container. Under the hood it's copying
257+
// weights, so we should free the buffer immediately.
258+
ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob(
259+
handle->container_handle, static_cast<const uint8_t*>(weights_blob)));
260+
buffer_res->Free();
261+
}
262+
237263
ET_LOG(Info, "MetalBackend::init - Initialization completed successfully");
238264
return (DelegateHandle*)handle; // Return the handle post-processing
239265
}

backends/apple/metal/runtime/shims/et_metal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,7 @@ extern "C" {
354354

355355
// Memory management functions for Metal
356356
void* metal_allocate_buffer(long bytes);
357+
void metal_deallocate_buffer(void* ptr);
357358
bool metal_is_device_pointer(void* ptr);
358359
int metal_copy_memory(
359360
void* dst,

backends/apple/metal/runtime/shims/et_metal.mm

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,21 @@ void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) {
8686
}
8787
}
8888

89+
void metal_deallocate_buffer(void* ptr) {
90+
@autoreleasepool {
91+
auto it = ptr_to_mtl_buffer.find(ptr);
92+
if (it != ptr_to_mtl_buffer.end()) {
93+
id<MTLBuffer> buffer = it->second;
94+
[buffer release];
95+
ptr_to_mtl_buffer.erase(it);
96+
ET_LOG(Debug, "Deallocated Metal buffer for pointer %p", ptr);
97+
ptr = nullptr;
98+
} else {
99+
ET_LOG(Error, "Failed to find Metal buffer for pointer %p", ptr);
100+
}
101+
}
102+
}
103+
89104
void metal_cleanup_resources() {
90105
if (!ptr_to_mtl_buffer.empty()) {
91106
@autoreleasepool {
@@ -665,12 +680,16 @@ int metal_copy_memory(void* dst, const void* src, size_t nbytes, bool src_is_dev
665680

666681
// Commit methods
667682
void ETMetalStream::commit() {
668-
if (enableCommitAndContinue_ && commandBuffer_) {
669-
// Use commit-and-continue for better performance
670-
commitAndContinue();
671-
} else {
672-
flush();
683+
if (!commandBuffer_) {
684+
ET_LOG(Error, "ETMetalStream::commit: No command buffer to commit");
685+
return;
673686
}
687+
688+
[commandBuffer_ commit];
689+
ET_LOG(Debug, "ETMetalStream::commit: Committed buffer %p", commandBuffer_);
690+
691+
[commandBuffer_ release];
692+
commandBuffer_ = nil;
674693
}
675694

676695
void ETMetalStream::commitAndWait() {

0 commit comments

Comments
 (0)