compile_kernel remove header_code arg (pytorch#163165)

msaroufim · pytorchmergebot · commit a89d5e97ec39 · 2025-09-17T19:47:32.000Z
We previously asked users to separate these because we didn't have any way of adding extern C declarations. Now we do, so we don't need this confusing flag anymore.

This is BC breaking, but that is fine for this API since it doesn't have major users yet. Please just put all your code in `kernel_source` moving forward.

## BC note

The `header_code` parameter has been removed from `torch.cuda._compile_kernel`. Previously, users could pass separate header code that would be prepended to the kernel source. Now, header code must be included directly in the `kernel_source` parameter.

Note this only affects `torch.cuda._compile_kernel`, which is a private API.

Example:

Before
```python
kernel = _compile_kernel(
    kernel_source="__global__ void my_kernel() { ... }",
    kernel_name="my_kernel",
    header_code="#define SCALE 2.0f\n__device__ float scale(float x) { return x * SCALE; }"
)
```

After
```python
kernel_source = """
#define SCALE 2.0f
__device__ float scale(float x) { return x * SCALE; }

__global__ void my_kernel() { ... }
"""
kernel = _compile_kernel(kernel_source, "my_kernel")
```

Pull Request resolved: pytorch#163165
Approved by: https://github.com/janeyx99, https://github.com/albanD
diff --git a/test/test_cuda.py b/test/test_cuda.py
@@ -6848,25 +6848,21 @@ def test_compile_kernel(self):
         self.assertEqual(c_int, expected_int)
 
         # Test with header code
-        header_code = """
+        scale_kernel_source = """
         #define SCALE_FACTOR 2.0f
 
         __device__ float scale_value(float val) {
             return val * SCALE_FACTOR;
         }
-        """
 
-        scale_kernel_source = """
         __global__ void scale_tensors(const float* input, float* output, int n) {
             int i = threadIdx.x + blockIdx.x * blockDim.x;
             if (i < n)
                 output[i] = scale_value(input[i]);
         }
         """
 
-        scale_kernel = _compile_kernel(
-            scale_kernel_source, "scale_tensors", header_code=header_code
-        )
+        scale_kernel = _compile_kernel(scale_kernel_source, "scale_tensors")
 
         input_tensor = torch.rand(N, device="cuda")
         output_tensor = torch.empty_like(input_tensor)
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
@@ -1733,7 +1733,6 @@ def _compile_kernel(
     kernel_source: str,
     kernel_name: str,
     compute_capability: Optional[str] = None,
-    header_code: str = "",
     cuda_include_dirs: Optional[list] = None,
     nvcc_options: Optional[list] = None,
 ):
@@ -1750,7 +1749,6 @@ def _compile_kernel(
         kernel_name (str): The name of the kernel function to compile
         compute_capability (str, optional): The compute capability to target (e.g., "86").
                                            If None, will detect from current device.
-        header_code (str, optional): Additional header code to prepend to the kernel source
         cuda_include_dirs (list, optional): List of directories containing CUDA headers
         nvcc_options (list, optional): Additional options to pass to NVRTC
 
@@ -1780,7 +1778,6 @@ def _compile_kernel(
         kernel_source,
         kernel_name,
         compute_capability,
-        header_code,
         cuda_include_dirs,
         nvcc_options,
     )
diff --git a/torch/cuda/_utils.py b/torch/cuda/_utils.py
@@ -114,7 +114,6 @@ def _nvrtc_compile(
     kernel_source: str,
     kernel_name: str,
     compute_capability: Optional[str] = None,
-    header_code: str = "",
     cuda_include_dirs: Optional[list] = None,
     nvcc_options: Optional[list] = None,
     auto_pch: bool = False,
@@ -127,7 +126,6 @@ def _nvrtc_compile(
         kernel_name (str): The name of the kernel function to compile
         compute_capability (str, None): The compute capability to target (e.g., "86").
                                            If None, will detect from current device.
-        header_code (str, optional): Additional header code to prepend to the kernel source
         cuda_include_dirs (list, None): List of directories containing CUDA headers
         nvcc_options (list, None): Additional options to pass to NVRTC
         auto_pch (bool): Enable automatic precompiled headers (CUDA 12.8+)
@@ -156,14 +154,8 @@ def check_nvrtc(result: int) -> None:
             )
             raise RuntimeError(f"CUDA error: {error_message}")
 
-    # Combine header code and kernel source
-    if header_code:
-        full_source = header_code + "\n" + kernel_source
-    else:
-        full_source = kernel_source
-
     # Convert source to bytes
-    source_bytes = full_source.encode("utf-8")
+    source_bytes = kernel_source.encode("utf-8")
 
     # Get compute capability if not provided
     if compute_capability is None: