Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/build-presets.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ jobs:
strategy:
fail-fast: false
matrix:
preset: [windows]
preset: [pybind, windows]
with:
job-name: build
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
Expand All @@ -122,14 +122,17 @@ jobs:
Set-PSDebug -Trace 1
\$ErrorActionPreference = 'Stop'
\$PSNativeCommandUseErrorActionPreference = \$true

conda create --yes --quiet -n et python=3.12
conda activate et
python install_requirements.py

cmake --preset ${{ matrix.preset }} -T ClangCL
if (\$LASTEXITCODE -ne 0) {
Write-Host "CMake configuration was unsuccessful. Exit code: \$LASTEXITCODE."
exit \$LASTEXITCODE
}

\$numCores = [System.Environment]::GetEnvironmentVariable('NUMBER_OF_PROCESSORS') - 1
cmake --build cmake-out -j \$numCores
if (\$LASTEXITCODE -ne 0) {
Expand Down
8 changes: 8 additions & 0 deletions backends/xnnpack/cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ else()
)
endif()

if(WIN32)
  # These XNNPACK options don't currently build on Windows.
  foreach(
    _xnn_opt
    XNNPACK_ENABLE_AVX256SKX
    XNNPACK_ENABLE_AVX256VNNI
    XNNPACK_ENABLE_AVX256VNNIGFNI
    XNNPACK_ENABLE_AVX512BF16
  )
    set_overridable_option(${_xnn_opt} OFF)
  endforeach()
endif()

set(XNNPACK_BUILD_ALL_MICROKERNELS
OFF
CACHE BOOL ""
Expand Down
5 changes: 5 additions & 0 deletions extension/data_loader/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ if(NOT ET_HAVE_SYS_MMAN_H AND NOT WIN32)
"extension/data_loader/mmap_data_loader.cpp"
)
endif()
if(WIN32)
  # On Windows, build the bundled mman shim (mman_windows.cpp) in place of the
  # POSIX <sys/mman.h>-based loader excluded above.
  list(APPEND _extension_data_loader__srcs "extension/data_loader/mman_windows.cpp")
endif()
list(TRANSFORM _extension_data_loader__srcs PREPEND "${EXECUTORCH_ROOT}/")
add_library(extension_data_loader ${_extension_data_loader__srcs})
target_link_libraries(extension_data_loader executorch_core)
Expand Down
3 changes: 2 additions & 1 deletion kernels/portable/cpu/op_amax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <c10/util/irange.h>
#include <cmath>

#include <executorch/kernels/portable/cpu/util/math_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -55,7 +56,7 @@ Tensor& amax_out(
for (const auto out_ix : c10::irange(begin, end)) {
out_data[out_ix] = plan.execute<CTYPE>(
[](CTYPE v, CTYPE max_v) {
return std::isnan(v) || v > max_v ? v : max_v;
return utils::isnan_override(v) || v > max_v ? v : max_v;
},
out_ix);
}
Expand Down
3 changes: 2 additions & 1 deletion kernels/portable/cpu/op_amin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <c10/util/irange.h>
#include <cmath>

#include <executorch/kernels/portable/cpu/util/math_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -54,7 +55,7 @@ Tensor& amin_out(
for (const auto out_ix : c10::irange(begin, end)) {
out_data[out_ix] = plan.execute<CTYPE>(
[](CTYPE v, CTYPE min_v) {
return std::isnan(v) || v < min_v ? v : min_v;
return utils::isnan_override(v) || v < min_v ? v : min_v;
},
out_ix);
}
Expand Down
3 changes: 2 additions & 1 deletion kernels/portable/cpu/op_argmax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <cmath>
#include <tuple>

#include <executorch/kernels/portable/cpu/util/math_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -58,7 +59,7 @@ Tensor& argmax_out(
// the below condition as written is equivalent to
// !isnan(accval) && (isnan(v) || v > acc_val). See
// argument in op_argmin.cpp.
if (!std::isnan(acc_val) && !(v <= acc_val)) {
if (!utils::isnan_override(acc_val) && !(v <= acc_val)) {
acc_val = v;
acc_ix = ix;
}
Expand Down
3 changes: 2 additions & 1 deletion kernels/portable/cpu/op_argmin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <cmath>
#include <tuple>

#include <executorch/kernels/portable/cpu/util/math_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -65,7 +66,7 @@ Tensor& argmin_out(
// - false, so the result is true. The result is trivially
// - true for the above condition that uses isnan(v) as
// - well.
if (!std::isnan(acc_val) && !(v >= acc_val)) {
if (!utils::isnan_override(acc_val) && !(v >= acc_val)) {
acc_val = v;
acc_ix = ix;
}
Expand Down
7 changes: 4 additions & 3 deletions kernels/portable/cpu/op_max.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <cmath>
#include <tuple>

#include <executorch/kernels/portable/cpu/util/math_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -88,8 +89,8 @@ std::tuple<Tensor&, Tensor&> max_out(
for (const auto out_ix : c10::irange(begin, end)) {
std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
[](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
if (!std::isnan(acc_val) &&
(std::isnan(v) || v > acc_val)) {
if (!utils::isnan_override(acc_val) &&
(utils::isnan_override(v) || v > acc_val)) {
acc_val = v;
acc_ix = ix;
}
Expand Down Expand Up @@ -132,7 +133,7 @@ max_unary_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
data_out[0] = lower_bound<CTYPE_OUT>();
for (const auto i : c10::irange(in.numel())) {
CTYPE_OUT val = static_cast<CTYPE_OUT>(data_in[i]);
if (std::isnan(val)) {
if (utils::isnan_override(val)) {
data_out[0] = val;
break;
}
Expand Down
7 changes: 4 additions & 3 deletions kernels/portable/cpu/op_min.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <cmath>
#include <tuple>

#include <executorch/kernels/portable/cpu/util/math_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -88,8 +89,8 @@ std::tuple<Tensor&, Tensor&> min_out(
for (const auto out_ix : c10::irange(begin, end)) {
std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
[](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
if (!std::isnan(acc_val) &&
(std::isnan(v) || v < acc_val)) {
if (!utils::isnan_override(acc_val) &&
(utils::isnan_override(v) || v < acc_val)) {
acc_val = v;
acc_ix = ix;
}
Expand Down Expand Up @@ -132,7 +133,7 @@ min_unary_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
data_out[0] = upper_bound<CTYPE_OUT>();
for (const auto i : c10::irange(in.numel())) {
CTYPE_OUT val = static_cast<CTYPE_OUT>(data_in[i]);
if (std::isnan(val)) {
if (utils::isnan_override(val)) {
data_out[0] = val;
break;
}
Expand Down
5 changes: 4 additions & 1 deletion kernels/portable/cpu/op_relu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <cmath>

#include <executorch/kernels/portable/cpu/util/functional_util.h>
#include <executorch/kernels/portable/cpu/util/math_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

Expand Down Expand Up @@ -45,7 +46,9 @@ Tensor& relu_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "relu.out", CTYPE, [&]() {
apply_unary_map_fn(
[](const CTYPE val_in) {
return (std::isnan(val_in) || val_in >= CTYPE(0)) ? val_in : CTYPE(0);
return (utils::isnan_override(val_in) || val_in >= CTYPE(0))
? val_in
: CTYPE(0);
},
in.const_data_ptr<CTYPE>(),
out.mutable_data_ptr<CTYPE>(),
Expand Down
3 changes: 2 additions & 1 deletion kernels/portable/cpu/op_sign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <cstring>

#include <executorch/kernels/portable/cpu/util/functional_util.h>
#include <executorch/kernels/portable/cpu/util/math_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

Expand Down Expand Up @@ -42,7 +43,7 @@ Tensor& sign_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "sign.out", CTYPE, [&] {
apply_unary_map_fn(
[](const CTYPE val_in) {
if (std::isnan(val_in)) {
if (utils::isnan_override(val_in)) {
return val_in;
} else {
return static_cast<CTYPE>((val_in > 0) - (val_in < 0));
Expand Down
4 changes: 3 additions & 1 deletion kernels/portable/cpu/op_topk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include <cmath>
#include <tuple>

#include <executorch/kernels/portable/cpu/util/math_util.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
Expand Down Expand Up @@ -62,7 +64,7 @@ bool float_less_than(T x, T y) {
if constexpr (std::is_integral_v<T>) {
return x < y;
}
return (!std::isnan(x) && std::isnan(y)) || x < y;
return (!utils::isnan_override(x) && utils::isnan_override(y)) || x < y;
}

template <typename CTYPE, typename elem_t = std::pair<CTYPE, int64_t>>
Expand Down
21 changes: 20 additions & 1 deletion kernels/portable/cpu/util/math_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>

#if defined(ET_USE_PYTORCH_HEADERS) && ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h>
#endif

#include <type_traits>

namespace torch {
namespace executor {
namespace native {
Expand All @@ -29,7 +33,8 @@ template <
typename std::enable_if<std::is_integral<INT_T>::value, bool>::type = true>
INT_T floor_divide(INT_T a, INT_T b) {
const auto quot = a / b;
if (std::signbit(a) == std::signbit(b)) {
// MSVC does not like signbit on integral types.
if ((a < 0) == (b < 0)) {
return quot;
}
const auto rem = a % b;
Expand All @@ -52,6 +57,20 @@ FLOAT_T floor_divide(FLOAT_T a, FLOAT_T b) {
return div;
}

/**
 * Drop-in replacement for std::isnan that also compiles under MSVC.
 * MSVC rejects std::isnan on integral arguments (ambiguous overload
 * resolution), so integral inputs are answered at compile time: an
 * integer is never NaN. All other types defer to std::isnan.
 */
template <typename T>
bool isnan_override(T a) {
  if constexpr (std::is_integral_v<T>) {
    return false;
  } else {
    return std::isnan(a);
  }
}

/**
* Override min/max so we can emulate PyTorch's behavior with NaN entries.
*/
Expand Down
11 changes: 11 additions & 0 deletions shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ ATEN_OPS = (
deps = [
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
"//executorch/runtime/core/exec_aten/util:tensor_util",
"//executorch/kernels/portable/cpu/util:math_util",
"//executorch/kernels/portable/cpu/util:reduce_util",
],
),
Expand All @@ -288,6 +289,7 @@ ATEN_OPS = (
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
"//executorch/runtime/core/exec_aten/util:tensor_util",
"//executorch/kernels/portable/cpu/util:index_util",
"//executorch/kernels/portable/cpu/util:math_util",
"//executorch/kernels/portable/cpu/util:reduce_util",
],
),
Expand All @@ -311,12 +313,14 @@ ATEN_OPS = (
op_target(
name = "op_argmax",
deps = [
"//executorch/kernels/portable/cpu/util:math_util",
"//executorch/kernels/portable/cpu/util:reduce_util",
],
),
op_target(
name = "op_argmin",
deps = [
"//executorch/kernels/portable/cpu/util:math_util",
"//executorch/kernels/portable/cpu/util:reduce_util",
],
),
Expand Down Expand Up @@ -839,6 +843,7 @@ ATEN_OPS = (
op_target(
name = "op_max",
deps = [
"//executorch/kernels/portable/cpu/util:math_util",
"//executorch/kernels/portable/cpu/util:reduce_util",
],
),
Expand Down Expand Up @@ -876,6 +881,7 @@ ATEN_OPS = (
op_target(
name = "op_min",
deps = [
"//executorch/kernels/portable/cpu/util:math_util",
"//executorch/kernels/portable/cpu/util:reduce_util",
],
),
Expand Down Expand Up @@ -1052,6 +1058,7 @@ ATEN_OPS = (
name = "op_relu",
deps = [
"//executorch/kernels/portable/cpu/util:functional_util",
"//executorch/kernels/portable/cpu/util:math_util",
],
),
op_target(
Expand Down Expand Up @@ -1162,6 +1169,7 @@ ATEN_OPS = (
name = "op_sign",
deps = [
"//executorch/kernels/portable/cpu/util:functional_util",
"//executorch/kernels/portable/cpu/util:math_util",
],
),
op_target(
Expand Down Expand Up @@ -1270,6 +1278,9 @@ ATEN_OPS = (
),
op_target(
name = "op_topk",
deps = [
"//executorch/kernels/portable/cpu/util:math_util",
]
),
op_target(
name = "op_transpose_copy",
Expand Down
3 changes: 2 additions & 1 deletion tools/cmake/preset/pybind.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@ set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TRAINING ON)

if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
set_overridable_option(EXECUTORCH_BUILD_COREML ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TRAINING ON)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set_overridable_option(EXECUTORCH_BUILD_COREML ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TRAINING ON)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL
"WIN32"
)
Expand Down
14 changes: 4 additions & 10 deletions tools/cmake/preset/windows.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,13 @@
# LICENSE file in the root directory of this source tree.

# keep sorted
set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)

# Below options are not yet buildable on Windows, but should be.
set(EXECUTORCH_BUILD_PORTABLE_OPS
OFF
CACHE BOOL ""
)
# set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER ON)
# set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
# set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
# set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON)
set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON)
Loading