[PTXAS] Upgrade ptxas to 12.9.86 for blackwell (#8476)

ThomasRaoux · web-flow · commit 3b3c852dd69b · 2025-10-17T19:48:01.000-07:00
Keep the old ptxas (12.8.93) for pre-Blackwell architectures, because the newer ptxas has functional regressions there.
diff --git a/.gitignore b/.gitignore
@@ -65,6 +65,7 @@ cmake-build-*
 cuobjdump
 nvdisasm
 ptxas
+ptxas-blackwell
 
 # Third-party include
 third_party/nvidia/backend/include
diff --git a/cmake/nvidia-toolchain-version.json b/cmake/nvidia-toolchain-version.json
@@ -1,4 +1,5 @@
 {
+  "ptxas-blackwell": "12.9.86",
   "ptxas": "12.8.93",
   "cuobjdump": "12.8.55",
   "nvdisasm": "12.8.55",
diff --git a/python/triton/knobs.py b/python/triton/knobs.py
@@ -488,6 +488,7 @@ class nvidia_knobs(base_knobs):
     cuobjdump: env_nvidia_tool = env_nvidia_tool("cuobjdump")
     nvdisasm: env_nvidia_tool = env_nvidia_tool("nvdisasm")
     ptxas: env_nvidia_tool = env_nvidia_tool("ptxas")
+    ptxas_blackwell: env_nvidia_tool = env_nvidia_tool("ptxas-blackwell")
 
     dump_nvptx: env_bool = env_bool("NVPTX_ENABLE_DUMP")
     disable_ptxas_opt: env_bool = env_bool("DISABLE_PTXAS_OPT")
diff --git a/setup.py b/setup.py
@@ -541,6 +541,17 @@ def download_and_copy_dependencies():
         url_func=lambda system, arch, version:
         f"https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/{system}-{arch}/cuda_nvcc-{system}-{arch}-{version}-archive.tar.xz",
     )
+
+    # Download a separate, newer ptxas used only for Blackwell: the newer ptxas has bugs when targeting Hopper, so pre-Blackwell keeps the default ptxas.
+    download_and_copy(
+        name="nvcc",
+        src_func=lambda system, arch, version: f"cuda_nvcc-{system}-{arch}-{version}-archive/bin/ptxas{exe_extension}",
+        dst_path="bin/ptxas-blackwell",
+        variable="TRITON_PTXAS_BLACKWELL_PATH",
+        version=NVIDIA_TOOLCHAIN_VERSION["ptxas-blackwell"],
+        url_func=lambda system, arch, version:
+        f"https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/{system}-{arch}/cuda_nvcc-{system}-{arch}-{version}-archive.tar.xz",
+    )
     download_and_copy(
         name="cuobjdump",
         src_func=lambda system, arch, version:
diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py
@@ -31,16 +31,16 @@ def check_dot_compatibility(lhs_type, rhs_type) -> Tuple[int, int, int]:  # [m,
     return check_dot_compatibility
 
 
-def get_ptxas() -> knobs.NvidiaTool:
-    return knobs.nvidia.ptxas
+def get_ptxas(arch: int) -> knobs.NvidiaTool:
+    return knobs.nvidia.ptxas_blackwell if arch >= 100 else knobs.nvidia.ptxas
 
 
 @functools.lru_cache()
-def get_ptxas_version():
+def get_ptxas_version(arch: int = 80):
     mock_ver = knobs.nvidia.mock_ptx_version
     if mock_ver is not None:
         return mock_ver  # This is not really a version of ptxas, but it is good enough for testing
-    version = subprocess.check_output([get_ptxas().path, "--version"]).decode("utf-8")
+    version = subprocess.check_output([get_ptxas(arch).path, "--version"]).decode("utf-8")
     return version
 
 
@@ -71,7 +71,7 @@ def ptx_get_version(cuda_version) -> int:
 def get_ptx_version_from_options(options, arch: int):
     ptx_version = options.ptx_version
     if ptx_version is None:
-        cuda_version = get_ptxas().version
+        cuda_version = get_ptxas(arch).version
         ptx_version = ptx_get_version(cuda_version)
     return ptx_version
 
@@ -465,7 +465,7 @@ def make_ptx(self, src, metadata, opt, capability):
         return ret
 
     def make_cubin(self, src, metadata, opt, capability):
-        ptxas = get_ptxas().path
+        ptxas = get_ptxas(self.target.arch).path
         with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.ptx') as fsrc, \
             tempfile.NamedTemporaryFile(delete=False, mode='r', suffix='.log') as flog:
             fsrc.write(src)
@@ -555,5 +555,5 @@ def add_stages(self, stages, options, language):
 
     @functools.lru_cache()
     def hash(self):
-        version = get_ptxas_version()
+        version = get_ptxas_version(self.target.arch)
         return f'{version}-{self.target.arch}'

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`{`
	`2`	`+ "ptxas-blackwell": "12.9.86",`
`2`	`3`	`"ptxas": "12.8.93",`
`3`	`4`	`"cuobjdump": "12.8.55",`
`4`	`5`	`"nvdisasm": "12.8.55",`