Skip to content
This repository was archived by the owner on Jan 27, 2026. It is now read-only.

Commit 73476d3

Browse files
authored
Introducing a way to build kernels for Windows platforms (#250)
This PR attempts to replicate what is being done with NixOS on the Windows platform through a PowerShell script, `kbuild.ps1`. It leverages `build2cmake` as much as possible and attempts to move some parts currently handled in Nix directly into CMake to avoid duplication as much as possible (when feasible). It also introduces two CMake functions to automatically generate the layout required by `kernels::get_local_kernel` and hub-based builds, so it is easy to debug and publish.
1 parent 0d501fd commit 73476d3

File tree

7 files changed

+877
-6
lines changed

7 files changed

+877
-6
lines changed
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
name: "Build and test kernel - Windows"
2+
on:
3+
push:
4+
branches: [main]
5+
pull_request:
6+
branches: [main]
7+
types: [opened, synchronize, reopened] # trigger on PRs
8+
workflow_dispatch:
9+
10+
jobs:
11+
build:
12+
strategy:
13+
matrix:
14+
os: [ windows-2022 ]
15+
python: [ '3.12', '3.13' ]
16+
torch: [
17+
{ version: '2.8', cuda: '12.9.1', wheel: '129' }
18+
]
19+
20+
name: Build kernel
21+
runs-on: ${{ matrix.os }}
22+
23+
steps:
24+
- uses: actions/cache@v4
25+
with:
26+
key: cuda-toolkit-v${{ matrix.cuda }}-${{ matrix.os }}
27+
path: |
28+
C:\Program Files\NVIDIA GPU Computing Toolkit
29+
~/.cargo/registry
30+
~/.cargo/git
31+
32+
- uses: actions/checkout@v5
33+
34+
# CUDA environment setup
35+
- uses: N-Storm/cuda-toolkit@v0.2.28
36+
id: setup-cuda-toolkit
37+
with:
38+
cuda: ${{ matrix.torch.cuda }} # TODO(mfuntowicz): How can we test multiple CUDA versions than align with torch?
39+
- name: "NVCC checks"
40+
run: nvcc -V
41+
42+
# Rust build environment setup
43+
- uses: actions-rs/toolchain@v1
44+
with:
45+
toolchain: stable
46+
profile: minimal
47+
override: true
48+
49+
- name: Build build2cmake
50+
run: ( cd build2cmake && cargo build --release )
51+
52+
# Python environment setup
53+
- uses: actions/setup-python@v6
54+
with:
55+
python-version: ${{ matrix.python }}
56+
cache: 'pip'
57+
58+
- name: Install PyTorch
59+
run: pip install torch --index-url https://download.pytorch.org/whl/cu129
60+
61+
- name: Build activation kernel
62+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/activation -BuildConfig Release -Backend cuda -Build -Force )
63+
# - name: Copy activation kernel
64+
# run: cp -rL examples/activation/build activation-kernel
65+
66+
- name: Build cutlass GEMM kernel
67+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/cutlass-gemm -BuildConfig Release -Backend cuda -Build -Force )
68+
# - name: Copy cutlass GEMM kernel
69+
# run: cp -rL examples/cutlass-gemm/result cutlass-gemm-kernel
70+
71+
- name: Build relu kernel
72+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/relu -BuildConfig Release -Backend cuda -Build -Force )
73+
# - name: Copy relu kernel
74+
# run: cp -rL examples/relu/result relu-kernel
75+
76+
- name: Build relu-backprop-compile kernel
77+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/relu-backprop-compile -BuildConfig Release -Backend cuda -Build -Force )
78+
# - name: Copy relu-backprop-compile kernel
79+
# run: cp -rL examples/relu-backprop-compile/result relu-backprop-compile-kernel
80+
81+
# Just test that we build with the extra torchVersions argument.
82+
# - name: Build relu kernel (specific Torch version)
83+
# run: ( cd examples/relu-specific-torch && nix build . )
84+
85+
- name: Build silu-and-mul-universal kernel
86+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/silu-and-mul-universal -BuildConfig Release -Build -Force)

build2cmake/src/templates/cuda/preamble.cmake

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,23 @@ else()
9898
${GPU_LANG}
9999
"${${GPU_LANG}_SUPPORTED_ARCHS}")
100100
endif()
101+
102+
103+
message(STATUS "Rendered for platform {{ platform }}")
104+
{% if platform == 'windows' %}
105+
include(${CMAKE_CURRENT_LIST_DIR}/cmake/windows.cmake)
106+
107+
# Generate standardized build name
108+
run_python(TORCH_VERSION "import torch; print(torch.__version__.split('+')[0])" "Failed to get Torch version")
109+
run_python(CXX11_ABI_VALUE "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')" "Failed to get CXX11 ABI")
110+
cmake_host_system_information(RESULT HOST_ARCH QUERY OS_PLATFORM)
111+
112+
set(SYSTEM_STRING "${HOST_ARCH}-windows")
113+
114+
if(GPU_LANG STREQUAL "CUDA")
115+
generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" ${CXX11_ABI_VALUE} "cuda" "${CUDA_VERSION}" "${SYSTEM_STRING}")
116+
elseif(GPU_LANG STREQUAL "HIP")
117+
run_python(ROCM_VERSION "import torch.version; print(torch.version.hip.split('.')[0] + '.' + torch.version.hip.split('.')[1])" "Failed to get ROCm version")
118+
generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" ${CXX11_ABI_VALUE} "rocm" "${ROCM_VERSION}" "${SYSTEM_STRING}")
119+
endif()
120+
{% endif %}

build2cmake/src/templates/cuda/torch-extension.cmake

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,17 @@ define_gpu_extension_target(
99
USE_SABI 3
1010
WITH_SOABI)
1111

12-
target_link_options({{ ops_name }} PRIVATE -static-libstdc++)
12+
if( NOT MSVC)
13+
target_link_options({{ ops_name }} PRIVATE -static-libstdc++)
14+
endif()
1315

16+
{% if platform == 'windows' %}
17+
# These methods below should be included from preamble.cmake on windows platform.
18+
19+
# Add kernels_install target for huggingface/kernels library layout
20+
add_kernels_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}")
21+
22+
# Add local_install target for local development with get_local_kernel()
23+
add_local_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}")
24+
25+
{% endif %}
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
# Generate a standardized build variant name following the pattern:
2+
# torch<VERSION>-<ABI>-<COMPUTE>-windows
3+
#
4+
# Arguments:
5+
# OUT_BUILD_NAME - Output variable name
6+
# TORCH_VERSION - PyTorch version (e.g., "2.7.1")
7+
# CXX11_ABI - Whether C++11 ABI is enabled (TRUE/FALSE)
8+
# COMPUTE_FRAMEWORK - One of: cuda, rocm, metal, xpu
9+
# COMPUTE_VERSION - Version of compute framework (e.g., "12.4" for CUDA, "6.0" for ROCm)
10+
# Example output: torch271-cxx11-cu124-x86_64-windows
11+
#
12+
function(generate_build_name OUT_BUILD_NAME TORCH_VERSION CXX11_ABI COMPUTE_FRAMEWORK COMPUTE_VERSION)
13+
# Flatten version by removing dots and padding to 2 components
14+
string(REPLACE "." ";" VERSION_LIST "${TORCH_VERSION}")
15+
list(LENGTH VERSION_LIST VERSION_COMPONENTS)
16+
17+
# Pad to at least 2 components
18+
if(VERSION_COMPONENTS LESS 2)
19+
list(APPEND VERSION_LIST "0")
20+
endif()
21+
22+
# Take first 2 components and join without dots
23+
list(GET VERSION_LIST 0 MAJOR)
24+
list(GET VERSION_LIST 1 MINOR)
25+
set(FLATTENED_TORCH "${MAJOR}${MINOR}")
26+
27+
# Generate compute string
28+
if(COMPUTE_FRAMEWORK STREQUAL "cuda")
29+
# Flatten CUDA version (e.g., "12.4" -> "124")
30+
string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}")
31+
list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS)
32+
if(COMPUTE_COMPONENTS GREATER_EQUAL 2)
33+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
34+
list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR)
35+
set(COMPUTE_STRING "cu${COMPUTE_MAJOR}${COMPUTE_MINOR}")
36+
else()
37+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
38+
set(COMPUTE_STRING "cu${COMPUTE_MAJOR}0")
39+
endif()
40+
elseif(COMPUTE_FRAMEWORK STREQUAL "rocm")
41+
# Flatten ROCm version (e.g., "6.0" -> "60")
42+
string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}")
43+
list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS)
44+
if(COMPUTE_COMPONENTS GREATER_EQUAL 2)
45+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
46+
list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR)
47+
set(COMPUTE_STRING "rocm${COMPUTE_MAJOR}${COMPUTE_MINOR}")
48+
else()
49+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
50+
set(COMPUTE_STRING "rocm${COMPUTE_MAJOR}0")
51+
endif()
52+
elseif(COMPUTE_FRAMEWORK STREQUAL "xpu")
53+
# Flatten XPU version (e.g., "2025.2" -> "202552")
54+
string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}")
55+
list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS)
56+
if(COMPUTE_COMPONENTS GREATER_EQUAL 2)
57+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
58+
list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR)
59+
set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}${COMPUTE_MINOR}")
60+
else()
61+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
62+
set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}0")
63+
endif()
64+
else()
65+
message(FATAL_ERROR "Unknown compute framework: ${COMPUTE_FRAMEWORK}")
66+
endif()
67+
68+
# Assemble the final build name
69+
if(ABI_STRING STREQUAL "")
70+
set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-windows")
71+
else()
72+
set(BUILD_NAME "torch${FLATTENED_TORCH}-${ABI_STRING}-${COMPUTE_STRING}-windows")
73+
endif()
74+
75+
set(${OUT_BUILD_NAME} "${BUILD_NAME}" PARENT_SCOPE)
76+
message(STATUS "Generated build name: ${BUILD_NAME}")
77+
endfunction()
78+
79+
#
80+
# Create a custom install target for the huggingface/kernels library layout.
81+
# This installs the extension into a directory structure suitable for kernel hub discovery:
82+
# <PREFIX>/<BUILD_VARIANT_NAME>/<PACKAGE_NAME>/
83+
#
84+
# Arguments:
85+
# TARGET_NAME - Name of the target to create the install rule for
86+
# PACKAGE_NAME - Python package name (e.g., "activation")
87+
# BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux")
88+
# INSTALL_PREFIX - Base installation directory (defaults to CMAKE_INSTALL_PREFIX)
89+
#
90+
function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME)
91+
set(oneValueArgs INSTALL_PREFIX)
92+
cmake_parse_arguments(ARG "" "${oneValueArgs}" "" ${ARGN})
93+
94+
if(NOT ARG_INSTALL_PREFIX)
95+
set(ARG_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
96+
endif()
97+
98+
# Create the kernels_install target if it doesn't exist
99+
if(NOT TARGET kernels_install)
100+
add_custom_target(kernels_install ALL
101+
COMMENT "Installing all kernels to hub-compatible layout"
102+
VERBATIM)
103+
endif()
104+
105+
# Create a custom target for this specific kernel
106+
set(KERNEL_INSTALL_TARGET "${TARGET_NAME}_kernel_install")
107+
set(KERNEL_INSTALL_DIR "${ARG_INSTALL_PREFIX}/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
108+
109+
add_custom_target(${KERNEL_INSTALL_TARGET} ALL
110+
COMMAND ${CMAKE_COMMAND} -E make_directory "${KERNEL_INSTALL_DIR}"
111+
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${TARGET_NAME}> "${KERNEL_INSTALL_DIR}/"
112+
COMMAND ${CMAKE_COMMAND} -E copy_directory
113+
"${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}"
114+
"${KERNEL_INSTALL_DIR}/"
115+
DEPENDS ${TARGET_NAME}
116+
COMMENT "Installing ${TARGET_NAME} to ${KERNEL_INSTALL_DIR}"
117+
VERBATIM)
118+
119+
# Make kernels_install depend on this specific kernel's install
120+
add_dependencies(kernels_install ${KERNEL_INSTALL_TARGET})
121+
122+
# Set folder for IDE organization
123+
if(MSVC OR XCODE)
124+
set_target_properties(${KERNEL_INSTALL_TARGET} PROPERTIES FOLDER "Install")
125+
endif()
126+
127+
message(STATUS "Added kernels_install target for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
128+
endfunction()
129+
130+
#
131+
# Add install rules for local development with huggingface/kernels.
132+
# This installs the extension into the layout expected by get_local_kernel():
133+
# ${CMAKE_SOURCE_DIR}/build/<BUILD_VARIANT_NAME>/<PACKAGE_NAME>/
134+
#
135+
# This allows developers to use get_local_kernel() from the kernels library to load
136+
# locally built kernels without needing to publish to the hub.
137+
#
138+
# This uses the standard CMake install() command, so it works with the default
139+
# "install" target that is always available.
140+
#
141+
# Arguments:
142+
# TARGET_NAME - Name of the target to create the install rule for
143+
# PACKAGE_NAME - Python package name (e.g., "activation")
144+
# BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux")
145+
#
146+
function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME)
147+
# Define your local, folder based, installation directory
148+
set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
149+
150+
# Glob Python files at configure time
151+
file(GLOB PYTHON_FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py")
152+
153+
# Create a custom target for local installation
154+
add_custom_target(local_install
155+
COMMENT "Installing files to local directory..."
156+
)
157+
158+
# Add custom commands to copy files
159+
add_custom_command(TARGET local_install POST_BUILD
160+
# Copy the shared library
161+
COMMAND ${CMAKE_COMMAND} -E copy_if_different
162+
$<TARGET_FILE:${TARGET_NAME}>
163+
${LOCAL_INSTALL_DIR}/
164+
165+
# Copy each Python file
166+
COMMAND ${CMAKE_COMMAND} -E copy_if_different
167+
${PYTHON_FILES}
168+
${LOCAL_INSTALL_DIR}/
169+
170+
COMMENT "Copying shared library and Python files to ${LOCAL_INSTALL_DIR}"
171+
COMMAND_EXPAND_LISTS
172+
)
173+
174+
file(MAKE_DIRECTORY ${LOCAL_INSTALL_DIR})
175+
message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
176+
endfunction()

build2cmake/src/torch/cuda.rs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use std::collections::HashSet;
2+
use std::env;
23
use std::io::Write;
34
use std::path::PathBuf;
45

@@ -12,6 +13,7 @@ use crate::version::Version;
1213
use crate::FileSet;
1314

1415
static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake");
16+
static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake");
1517
static REGISTRATION_H: &str = include_str!("../templates/registration.h");
1618
static HIPIFY: &str = include_str!("../templates/cuda/hipify.py");
1719
static CUDA_SUPPORTED_ARCHS_JSON: &str = include_str!("../cuda_supported_archs.json");
@@ -155,6 +157,13 @@ fn write_cmake(
155157
.entry(utils_path.clone())
156158
.extend_from_slice(CMAKE_UTILS.as_bytes());
157159

160+
let mut windows_utils_path = PathBuf::new();
161+
windows_utils_path.push("cmake");
162+
windows_utils_path.push("windows.cmake");
163+
file_set
164+
.entry(windows_utils_path.clone())
165+
.extend_from_slice(WINDOWS_UTILS.as_bytes());
166+
158167
let mut hipify_path = PathBuf::new();
159168
hipify_path.push("cmake");
160169
hipify_path.push("hipify.py");
@@ -184,7 +193,7 @@ fn write_cmake(
184193
render_kernel(env, kernel_name, kernel, cmake_writer)?;
185194
}
186195

187-
render_extension(env, ops_name, cmake_writer)?;
196+
render_extension(env, name, ops_name, cmake_writer)?;
188197

189198
Ok(())
190199
}
@@ -351,11 +360,17 @@ pub fn render_kernel(
351360
Ok(())
352361
}
353362

354-
pub fn render_extension(env: &Environment, ops_name: &str, write: &mut impl Write) -> Result<()> {
363+
pub fn render_extension(
364+
env: &Environment,
365+
name: &str,
366+
ops_name: &str,
367+
write: &mut impl Write,
368+
) -> Result<()> {
355369
env.get_template("cuda/torch-extension.cmake")
356370
.wrap_err("Cannot get Torch extension template")?
357371
.render_to_write(
358372
context! {
373+
name => name,
359374
ops_name => ops_name,
360375
},
361376
&mut *write,
@@ -382,7 +397,7 @@ pub fn render_preamble(
382397
cuda_minver => cuda_minver.map(|v| v.to_string()),
383398
cuda_maxver => cuda_maxver.map(|v| v.to_string()),
384399
cuda_supported_archs => cuda_supported_archs(),
385-
400+
platform => env::consts::OS
386401
},
387402
&mut *write,
388403
)

0 commit comments

Comments
 (0)