
Commit ba94c21

ptorrunv and ptorru authored
Enabling support for enhanced customization of Nvidia ptxas options (#6993)
# Enabling enhanced `ptxas` customization

This PR enables broader support for `ptxas` customization via the following functionality:

* Ability to pass specific `ptxas` options. The available options are documented [here](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#ptxas-options).
* Ability to pass these options for specific kernel calls.

Benefits:

* Enables parameters to be passed through to `ptxas`.
* Enables targeted customization of the compilation behavior of each individual kernel call.

Usage: pass a string with `ptxas` options as the function parameter `ptx_options` in any given kernel call.

Example: for tutorial `03-matrix-multiplication.py`, one can enable `opt-level 3` for `leaky_relu` and `opt-level 0` for `matmul_kernel` like so:

```python
...
if ACTIVATION == "leaky_relu":
    accumulator = leaky_relu(accumulator, ptx_options="--opt-level=3")
...
matmul_kernel[grid](
    a, b, c,  #
    M, N, K,  #
    a.stride(0), a.stride(1),  #
    b.stride(0), b.stride(1),  #
    c.stride(0), c.stride(1),  #
    ACTIVATION=activation,  #
    ptx_options="--opt-level=0")
```

Testing done: this was tested by modifying the following Python tutorials:

* `02-fused-softmax`
* `03-matrix-multiplication`

I also checked the behavior of cached compiles and can confirm the cache works as expected for different options on a given kernel.

---------

Co-authored-by: Pedro Torruella <[email protected]>
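For a self-contained illustration, here is a minimal sketch of passing `ptx_options` at a kernel launch, assuming a Triton build that includes this change. The kernel and tensor names are illustrative (not from the tutorials above); the flags shown are standard `ptxas` options from the documentation linked in the description.

```python
# Minimal sketch (assumes a Triton build with this change): a trivial
# elementwise kernel whose ptxas invocation is customized per launch.
import torch
import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)


x = torch.rand(4096, device="cuda")
y = torch.rand(4096, device="cuda")
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 1024), )
# Multiple ptxas flags go in one space-separated string; they are split on
# spaces and appended to the ptxas command line.
add_kernel[grid](x, y, out, x.numel(), BLOCK_SIZE=1024,
                 ptx_options="--opt-level=3 --allow-expensive-optimizations=true")
```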
1 parent 5c9e545 commit ba94c21

File tree

1 file changed: +12 −2 lines changed


third_party/nvidia/backend/compiler.py

Lines changed: 12 additions & 2 deletions

```diff
@@ -106,6 +106,7 @@ class CUDAOptions:
     maxnreg: Optional[int] = None
     cluster_dims: tuple = (1, 1, 1)
     ptx_version: int = None
+    ptx_options: str = None
     ir_override: Optional[str] = None  # filename of a user-defined IR (*.{ttir|ttgir|llir|ptx})
     enable_fp_fusion: bool = True
     launch_cooperative_grid: bool = False
@@ -407,8 +408,17 @@ def make_cubin(self, src, metadata, opt, capability):
         line_info = ["-lineinfo", "-suppress-debug-info"] if knobs.compilation.disable_line_info else ["-lineinfo"]
         fmad = [] if opt.enable_fp_fusion else ['--fmad=false']
         arch = sm_arch_from_capability(capability)
-        opt_level = ['--opt-level', '0'] if knobs.nvidia.disable_ptxas_opt else []
-        ptxas_cmd = [ptxas, *line_info, *fmad, '-v', *opt_level, f'--gpu-name={arch}', fsrc.name, '-o', fbin]
+
+        # Disable ptxas optimizations if requested
+        disable_opt = ['--opt-level', '0'] if knobs.nvidia.disable_ptxas_opt else []
+
+        # Accept more ptxas options if provided
+        ptx_extra_options = opt.ptx_options.split(" ") if opt.ptx_options else []
+
+        ptxas_cmd = [
+            ptxas, *line_info, *fmad, '-v', *disable_opt, *ptx_extra_options, f'--gpu-name={arch}', fsrc.name, '-o',
+            fbin
+        ]
         try:
             subprocess.run(ptxas_cmd, check=True, close_fds=False, stderr=flog)
             if os.path.exists(fsrc.name):
```
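For reference, here is a standalone sketch of how the extra options end up on the command line. `build_ptxas_cmd` is an illustrative helper, not a real Triton function; only the split-on-spaces behavior is taken from the diff above, and the remaining flags are simplified.

```python
from typing import Optional


def build_ptxas_cmd(ptxas: str, arch: str, src: str, out: str,
                    ptx_options: Optional[str] = None) -> list:
    """Illustrative reimplementation of the command assembly in make_cubin."""
    # A single space-separated option string becomes individual argv entries.
    extra = ptx_options.split(" ") if ptx_options else []
    return [ptxas, '-lineinfo', '-v', *extra, f'--gpu-name={arch}', src, '-o', out]


print(build_ptxas_cmd('ptxas', 'sm_90', 'kernel.ptx', 'kernel.cubin',
                      ptx_options="--opt-level=3"))
# ['ptxas', '-lineinfo', '-v', '--opt-level=3', '--gpu-name=sm_90',
#  'kernel.ptx', '-o', 'kernel.cubin']
```

One consequence of the plain `split(" ")` is that repeated spaces yield empty argv entries and shell-style quoting is not honored, so the option string should be simple and single-spaced.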
