Skip to content
This repository was archived by the owner on Jan 27, 2026. It is now read-only.

Commit 73476d3

Browse files
authored
Introducing a way to build kernels for Windows platforms (#250)
This PR attempts to replicate what is being done with NixOS on the Windows platform through a PowerShell script, `kbuild.ps1`. It leverages `build2cmake` as much as possible and attempts to move some parts currently handled in Nix directly into CMake to avoid duplication as much as possible (when feasible). It also introduces two CMake functions to automatically generate the layout required by `kernels::get_local_kernel` and hub-based builds, so it is easy to debug and publish.
1 parent 0d501fd commit 73476d3

File tree

7 files changed

+877
-6
lines changed

7 files changed

+877
-6
lines changed
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
name: "Build and test kernel - Windows"
2+
on:
3+
push:
4+
branches: [main]
5+
pull_request:
6+
branches: [main]
7+
types: [opened, synchronize, reopened] # trigger on PRs
8+
workflow_dispatch:
9+
10+
jobs:
11+
build:
12+
strategy:
13+
matrix:
14+
os: [ windows-2022 ]
15+
python: [ '3.12', '3.13' ]
16+
torch: [
17+
{ version: '2.8', cuda: '12.9.1', wheel: '129' }
18+
]
19+
20+
name: Build kernel
21+
runs-on: ${{ matrix.os }}
22+
23+
steps:
24+
- uses: actions/cache@v4
25+
with:
26+
key: cuda-toolkit-v${{ matrix.cuda }}-${{ matrix.os }}
27+
path: |
28+
C:\Program Files\NVIDIA GPU Computing Toolkit
29+
~/.cargo/registry
30+
~/.cargo/git
31+
32+
- uses: actions/checkout@v5
33+
34+
# CUDA environment setup
35+
- uses: N-Storm/cuda-toolkit@v0.2.28
36+
id: setup-cuda-toolkit
37+
with:
38+
cuda: ${{ matrix.torch.cuda }} # TODO(mfuntowicz): How can we test multiple CUDA versions than align with torch?
39+
- name: "NVCC checks"
40+
run: nvcc -V
41+
42+
# Rust build environment setup
43+
- uses: actions-rs/toolchain@v1
44+
with:
45+
toolchain: stable
46+
profile: minimal
47+
override: true
48+
49+
- name: Build build2cmake
50+
run: ( cd build2cmake && cargo build --release )
51+
52+
# Python environment setup
53+
- uses: actions/setup-python@v6
54+
with:
55+
python-version: ${{ matrix.python }}
56+
cache: 'pip'
57+
58+
- name: Install PyTorch
59+
run: pip install torch --index-url https://download.pytorch.org/whl/cu129
60+
61+
- name: Build activation kernel
62+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/activation -BuildConfig Release -Backend cuda -Build -Force )
63+
# - name: Copy activation kernel
64+
# run: cp -rL examples/activation/build activation-kernel
65+
66+
- name: Build cutlass GEMM kernel
67+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/cutlass-gemm -BuildConfig Release -Backend cuda -Build -Force )
68+
# - name: Copy cutlass GEMM kernel
69+
# run: cp -rL examples/cutlass-gemm/result cutlass-gemm-kernel
70+
71+
- name: Build relu kernel
72+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/relu -BuildConfig Release -Backend cuda -Build -Force )
73+
# - name: Copy relu kernel
74+
# run: cp -rL examples/relu/result relu-kernel
75+
76+
- name: Build relu-backprop-compile kernel
77+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/relu-backprop-compile -BuildConfig Release -Backend cuda -Build -Force )
78+
# - name: Copy relu-backprop-compile kernel
79+
# run: cp -rL examples/relu-backprop-compile/result relu-backprop-compile-kernel
80+
81+
# Just test that we build with the extra torchVersions argument.
82+
# - name: Build relu kernel (specific Torch version)
83+
# run: ( cd examples/relu-specific-torch && nix build . )
84+
85+
- name: Build silu-and-mul-universal kernel
86+
run: ( scripts\windows\builder.ps1 -SourceFolder examples/silu-and-mul-universal -BuildConfig Release -Build -Force)

build2cmake/src/templates/cuda/preamble.cmake

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,23 @@ else()
9898
${GPU_LANG}
9999
"${${GPU_LANG}_SUPPORTED_ARCHS}")
100100
endif()
101+
102+
103+
message(STATUS "Rendered for platform {{ platform }}")
104+
{% if platform == 'windows' %}
105+
include(${CMAKE_CURRENT_LIST_DIR}/cmake/windows.cmake)
106+
107+
# Generate standardized build name
108+
run_python(TORCH_VERSION "import torch; print(torch.__version__.split('+')[0])" "Failed to get Torch version")
109+
run_python(CXX11_ABI_VALUE "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')" "Failed to get CXX11 ABI")
110+
cmake_host_system_information(RESULT HOST_ARCH QUERY OS_PLATFORM)
111+
112+
set(SYSTEM_STRING "${HOST_ARCH}-windows")
113+
114+
if(GPU_LANG STREQUAL "CUDA")
115+
generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" ${CXX11_ABI_VALUE} "cuda" "${CUDA_VERSION}" "${SYSTEM_STRING}")
116+
elseif(GPU_LANG STREQUAL "HIP")
117+
run_python(ROCM_VERSION "import torch.version; print(torch.version.hip.split('.')[0] + '.' + torch.version.hip.split('.')[1])" "Failed to get ROCm version")
118+
generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" ${CXX11_ABI_VALUE} "rocm" "${ROCM_VERSION}" "${SYSTEM_STRING}")
119+
endif()
120+
{% endif %}

build2cmake/src/templates/cuda/torch-extension.cmake

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,17 @@ define_gpu_extension_target(
99
USE_SABI 3
1010
WITH_SOABI)
1111

12-
target_link_options({{ ops_name }} PRIVATE -static-libstdc++)
12+
if( NOT MSVC)
13+
target_link_options({{ ops_name }} PRIVATE -static-libstdc++)
14+
endif()
1315

16+
{% if platform == 'windows' %}
17+
# These methods below should be included from preamble.cmake on windows platform.
18+
19+
# Add kernels_install target for huggingface/kernels library layout
20+
add_kernels_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}")
21+
22+
# Add local_install target for local development with get_local_kernel()
23+
add_local_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}")
24+
25+
{% endif %}
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
# Generate a standardized build variant name following the pattern:
2+
# torch<VERSION>-<ABI>-<COMPUTE>-windows
3+
#
4+
# Arguments:
5+
# OUT_BUILD_NAME - Output variable name
6+
# TORCH_VERSION - PyTorch version (e.g., "2.7.1")
7+
# CXX11_ABI - Whether C++11 ABI is enabled (TRUE/FALSE)
8+
# COMPUTE_FRAMEWORK - One of: cuda, rocm, metal, xpu
9+
# COMPUTE_VERSION - Version of compute framework (e.g., "12.4" for CUDA, "6.0" for ROCm)
10+
# Example output: torch271-cxx11-cu124-x86_64-windows
11+
#
12+
function(generate_build_name OUT_BUILD_NAME TORCH_VERSION CXX11_ABI COMPUTE_FRAMEWORK COMPUTE_VERSION)
13+
# Flatten version by removing dots and padding to 2 components
14+
string(REPLACE "." ";" VERSION_LIST "${TORCH_VERSION}")
15+
list(LENGTH VERSION_LIST VERSION_COMPONENTS)
16+
17+
# Pad to at least 2 components
18+
if(VERSION_COMPONENTS LESS 2)
19+
list(APPEND VERSION_LIST "0")
20+
endif()
21+
22+
# Take first 2 components and join without dots
23+
list(GET VERSION_LIST 0 MAJOR)
24+
list(GET VERSION_LIST 1 MINOR)
25+
set(FLATTENED_TORCH "${MAJOR}${MINOR}")
26+
27+
# Generate compute string
28+
if(COMPUTE_FRAMEWORK STREQUAL "cuda")
29+
# Flatten CUDA version (e.g., "12.4" -> "124")
30+
string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}")
31+
list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS)
32+
if(COMPUTE_COMPONENTS GREATER_EQUAL 2)
33+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
34+
list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR)
35+
set(COMPUTE_STRING "cu${COMPUTE_MAJOR}${COMPUTE_MINOR}")
36+
else()
37+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
38+
set(COMPUTE_STRING "cu${COMPUTE_MAJOR}0")
39+
endif()
40+
elseif(COMPUTE_FRAMEWORK STREQUAL "rocm")
41+
# Flatten ROCm version (e.g., "6.0" -> "60")
42+
string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}")
43+
list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS)
44+
if(COMPUTE_COMPONENTS GREATER_EQUAL 2)
45+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
46+
list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR)
47+
set(COMPUTE_STRING "rocm${COMPUTE_MAJOR}${COMPUTE_MINOR}")
48+
else()
49+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
50+
set(COMPUTE_STRING "rocm${COMPUTE_MAJOR}0")
51+
endif()
52+
elseif(COMPUTE_FRAMEWORK STREQUAL "xpu")
53+
# Flatten XPU version (e.g., "2025.2" -> "202552")
54+
string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}")
55+
list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS)
56+
if(COMPUTE_COMPONENTS GREATER_EQUAL 2)
57+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
58+
list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR)
59+
set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}${COMPUTE_MINOR}")
60+
else()
61+
list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
62+
set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}0")
63+
endif()
64+
else()
65+
message(FATAL_ERROR "Unknown compute framework: ${COMPUTE_FRAMEWORK}")
66+
endif()
67+
68+
# Assemble the final build name
69+
if(ABI_STRING STREQUAL "")
70+
set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-windows")
71+
else()
72+
set(BUILD_NAME "torch${FLATTENED_TORCH}-${ABI_STRING}-${COMPUTE_STRING}-windows")
73+
endif()
74+
75+
set(${OUT_BUILD_NAME} "${BUILD_NAME}" PARENT_SCOPE)
76+
message(STATUS "Generated build name: ${BUILD_NAME}")
77+
endfunction()
78+
79+
#
80+
# Create a custom install target for the huggingface/kernels library layout.
81+
# This installs the extension into a directory structure suitable for kernel hub discovery:
82+
# <PREFIX>/<BUILD_VARIANT_NAME>/<PACKAGE_NAME>/
83+
#
84+
# Arguments:
85+
# TARGET_NAME - Name of the target to create the install rule for
86+
# PACKAGE_NAME - Python package name (e.g., "activation")
87+
# BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux")
88+
# INSTALL_PREFIX - Base installation directory (defaults to CMAKE_INSTALL_PREFIX)
89+
#
90+
function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME)
91+
set(oneValueArgs INSTALL_PREFIX)
92+
cmake_parse_arguments(ARG "" "${oneValueArgs}" "" ${ARGN})
93+
94+
if(NOT ARG_INSTALL_PREFIX)
95+
set(ARG_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
96+
endif()
97+
98+
# Create the kernels_install target if it doesn't exist
99+
if(NOT TARGET kernels_install)
100+
add_custom_target(kernels_install ALL
101+
COMMENT "Installing all kernels to hub-compatible layout"
102+
VERBATIM)
103+
endif()
104+
105+
# Create a custom target for this specific kernel
106+
set(KERNEL_INSTALL_TARGET "${TARGET_NAME}_kernel_install")
107+
set(KERNEL_INSTALL_DIR "${ARG_INSTALL_PREFIX}/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
108+
109+
add_custom_target(${KERNEL_INSTALL_TARGET} ALL
110+
COMMAND ${CMAKE_COMMAND} -E make_directory "${KERNEL_INSTALL_DIR}"
111+
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${TARGET_NAME}> "${KERNEL_INSTALL_DIR}/"
112+
COMMAND ${CMAKE_COMMAND} -E copy_directory
113+
"${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}"
114+
"${KERNEL_INSTALL_DIR}/"
115+
DEPENDS ${TARGET_NAME}
116+
COMMENT "Installing ${TARGET_NAME} to ${KERNEL_INSTALL_DIR}"
117+
VERBATIM)
118+
119+
# Make kernels_install depend on this specific kernel's install
120+
add_dependencies(kernels_install ${KERNEL_INSTALL_TARGET})
121+
122+
# Set folder for IDE organization
123+
if(MSVC OR XCODE)
124+
set_target_properties(${KERNEL_INSTALL_TARGET} PROPERTIES FOLDER "Install")
125+
endif()
126+
127+
message(STATUS "Added kernels_install target for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
128+
endfunction()
129+
130+
#
131+
# Add install rules for local development with huggingface/kernels.
132+
# This installs the extension into the layout expected by get_local_kernel():
133+
# ${CMAKE_SOURCE_DIR}/build/<BUILD_VARIANT_NAME>/<PACKAGE_NAME>/
134+
#
135+
# This allows developers to use get_local_kernel() from the kernels library to load
136+
# locally built kernels without needing to publish to the hub.
137+
#
138+
# This uses the standard CMake install() command, so it works with the default
139+
# "install" target that is always available.
140+
#
141+
# Arguments:
142+
# TARGET_NAME - Name of the target to create the install rule for
143+
# PACKAGE_NAME - Python package name (e.g., "activation")
144+
# BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux")
145+
#
146+
function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME)
147+
# Define your local, folder based, installation directory
148+
set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
149+
150+
# Glob Python files at configure time
151+
file(GLOB PYTHON_FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py")
152+
153+
# Create a custom target for local installation
154+
add_custom_target(local_install
155+
COMMENT "Installing files to local directory..."
156+
)
157+
158+
# Add custom commands to copy files
159+
add_custom_command(TARGET local_install POST_BUILD
160+
# Copy the shared library
161+
COMMAND ${CMAKE_COMMAND} -E copy_if_different
162+
$<TARGET_FILE:${TARGET_NAME}>
163+
${LOCAL_INSTALL_DIR}/
164+
165+
# Copy each Python file
166+
COMMAND ${CMAKE_COMMAND} -E copy_if_different
167+
${PYTHON_FILES}
168+
${LOCAL_INSTALL_DIR}/
169+
170+
COMMENT "Copying shared library and Python files to ${LOCAL_INSTALL_DIR}"
171+
COMMAND_EXPAND_LISTS
172+
)
173+
174+
file(MAKE_DIRECTORY ${LOCAL_INSTALL_DIR})
175+
message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
176+
endfunction()

build2cmake/src/torch/cuda.rs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use std::collections::HashSet;
2+
use std::env;
23
use std::io::Write;
34
use std::path::PathBuf;
45

@@ -12,6 +13,7 @@ use crate::version::Version;
1213
use crate::FileSet;
1314

1415
static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake");
16+
static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake");
1517
static REGISTRATION_H: &str = include_str!("../templates/registration.h");
1618
static HIPIFY: &str = include_str!("../templates/cuda/hipify.py");
1719
static CUDA_SUPPORTED_ARCHS_JSON: &str = include_str!("../cuda_supported_archs.json");
@@ -155,6 +157,13 @@ fn write_cmake(
155157
.entry(utils_path.clone())
156158
.extend_from_slice(CMAKE_UTILS.as_bytes());
157159

160+
let mut windows_utils_path = PathBuf::new();
161+
windows_utils_path.push("cmake");
162+
windows_utils_path.push("windows.cmake");
163+
file_set
164+
.entry(windows_utils_path.clone())
165+
.extend_from_slice(WINDOWS_UTILS.as_bytes());
166+
158167
let mut hipify_path = PathBuf::new();
159168
hipify_path.push("cmake");
160169
hipify_path.push("hipify.py");
@@ -184,7 +193,7 @@ fn write_cmake(
184193
render_kernel(env, kernel_name, kernel, cmake_writer)?;
185194
}
186195

187-
render_extension(env, ops_name, cmake_writer)?;
196+
render_extension(env, name, ops_name, cmake_writer)?;
188197

189198
Ok(())
190199
}
@@ -351,11 +360,17 @@ pub fn render_kernel(
351360
Ok(())
352361
}
353362

354-
pub fn render_extension(env: &Environment, ops_name: &str, write: &mut impl Write) -> Result<()> {
363+
pub fn render_extension(
364+
env: &Environment,
365+
name: &str,
366+
ops_name: &str,
367+
write: &mut impl Write,
368+
) -> Result<()> {
355369
env.get_template("cuda/torch-extension.cmake")
356370
.wrap_err("Cannot get Torch extension template")?
357371
.render_to_write(
358372
context! {
373+
name => name,
359374
ops_name => ops_name,
360375
},
361376
&mut *write,
@@ -382,7 +397,7 @@ pub fn render_preamble(
382397
cuda_minver => cuda_minver.map(|v| v.to_string()),
383398
cuda_maxver => cuda_maxver.map(|v| v.to_string()),
384399
cuda_supported_archs => cuda_supported_archs(),
385-
400+
platform => env::consts::OS
386401
},
387402
&mut *write,
388403
)

0 commit comments

Comments
 (0)