Skip to content

Commit 50ab08d

Browse files
agron911 authored and meta-codesync[bot] committed
[triton][beta] [Cherry-pick] '[BENCH] Integrate hipblas in roofline measurement (#8216)' (#995)
Summary: Pull Request resolved: #995 This is a cherry-pick of an upstream PR: triton-lang/triton#8216 Upstream commit message: ``` > [BENCH] Integrate hipblas in roofline measurement (#8216) ``` ***Do not remove the following line from this commit*** Reactor Cherry-pick Revision: 11b19e4 --- This diff was generated by running: ``` buck run fbcode//triton/tools/reactor:reactor -- cherrypick --num-commits 1 ``` Reviewed By: dshi7 Differential Revision: D94471625 fbshipit-source-id: 76103b2d01e3d89c61026093be278aabefd766d5
1 parent 07c0acc commit 50ab08d

File tree

4 files changed

+59
-26
lines changed

4 files changed

+59
-26
lines changed

python/test/unit/runtime/test_blaslt.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ def test_blaslt(m, n, k, dtype_str, device):
2323
if dtype_str == "float8_e4m3fn" and not is_hip_cdna4():
2424
pytest.skip("float8_e4m3fn is only supported on HIP CDNA4")
2525
c_dtype = torch.float16 if dtype_str in ("float8_e4m3fnuz", "float8_e4m3fn") else dtype
26-
make_handle = lambda workspace: vendor.hipblas.HipBlasLt(workspace)
26+
make_handle = lambda workspace: vendor.hipblas.HipblasLt(workspace)
2727
else:
2828
pytest.skip("test_blaslt is only supported on CUDA or HIP")
2929

python/triton_kernels/triton_kernels/roofline.py

Lines changed: 51 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,12 @@
11
import ctypes
22
import matplotlib.pyplot as plt
33
import triton
4-
from triton._C.libtriton import nvidia
4+
from triton._C.libtriton import nvidia, amd
55
import torch
66
import csv
77
from dataclasses import dataclass
88
import inspect
9+
from .target_info import is_hip, is_cuda
910

1011

1112
@dataclass
@@ -84,23 +85,48 @@ def inject_proxy_and_call(val, args, kwargs):
8485

8586

8687
def get_memset_tbps():
87-
# Measure device memory set bandwidth using CUDA driver API (cuMemsetD8Async)
88-
if torch.version.cuda is None:
89-
raise RuntimeError("get_memset_tbps is only supported on CUDA")
90-
# load cuda
91-
cuda = ctypes.CDLL("libcuda.so")
92-
cuda.cuInit.argtypes = [ctypes.c_uint]
93-
cuda.cuInit.restype = ctypes.c_int
94-
if cuda.cuInit(0) != 0:
95-
raise RuntimeError("cuInit failed")
96-
# initialize cuMemsetD8Async
97-
cuda.cuMemsetD8Async.argtypes = [ctypes.c_uint64, ctypes.c_ubyte, ctypes.c_size_t, ctypes.c_void_p]
98-
cuda.cuMemsetD8Async.restype = ctypes.c_int
99-
# benchmark `cuMemsetD8Async`
10088
n_bytes = 1 << 32
10189
buf = torch.empty(n_bytes, device="cuda", dtype=torch.uint8)
102-
dptr = ctypes.c_uint64(buf.data_ptr())
103-
fn = lambda: cuda.cuMemsetD8Async(dptr, ctypes.c_ubyte(0), ctypes.c_size_t(n_bytes), ctypes.c_void_p(0))
90+
stream0 = ctypes.c_void_p(0)
91+
92+
if is_cuda():
93+
libname = "libcuda.so"
94+
init_name = "cuInit"
95+
memset_name = "cuMemsetD8Async"
96+
memset_argtypes = [ctypes.c_uint64, ctypes.c_ubyte, ctypes.c_size_t, ctypes.c_void_p]
97+
dptr = ctypes.c_uint64(buf.data_ptr())
98+
value = ctypes.c_ubyte(0)
99+
elif is_hip():
100+
libname = "libamdhip64.so"
101+
init_name = "hipInit"
102+
memset_name = "hipMemsetAsync"
103+
memset_argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t, ctypes.c_void_p]
104+
dptr = ctypes.c_void_p(buf.data_ptr())
105+
value = ctypes.c_int(0)
106+
else:
107+
raise RuntimeError("Unsupported platform: neither CUDA nor ROCm detected")
108+
109+
lib = ctypes.CDLL(libname)
110+
111+
# optional init
112+
if hasattr(lib, init_name):
113+
init_fn = getattr(lib, init_name)
114+
init_fn.argtypes = [ctypes.c_uint]
115+
init_fn.restype = ctypes.c_int
116+
init_fn(0)
117+
118+
if not hasattr(lib, memset_name):
119+
raise RuntimeError(f"{memset_name} not found in {libname}")
120+
121+
memset_fn = getattr(lib, memset_name)
122+
memset_fn.argtypes = memset_argtypes
123+
memset_fn.restype = ctypes.c_int
124+
125+
def fn():
126+
err = memset_fn(dptr, value, ctypes.c_size_t(n_bytes), stream0)
127+
if err != 0:
128+
raise RuntimeError(f"{memset_name} failed with error {err}")
129+
104130
time_ms = triton.testing.do_bench(fn, rep=1000)
105131
tbps = (n_bytes / (time_ms * 1e-3)) * 1e-12
106132
return tbps
@@ -109,13 +135,20 @@ def get_memset_tbps():
109135
def get_cublas_tflops(dtype):
110136
dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}[dtype]
111137
cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
112-
cublas = nvidia.cublas.CublasLt(cublas_workspace)
138+
if is_cuda():
139+
cublas = nvidia.cublas.CublasLt(cublas_workspace)
140+
bench_fn = cublas.matmul
141+
elif is_hip():
142+
hipblas = amd.hipblas.HipblasLt(cublas_workspace)
143+
bench_fn = hipblas.matmul
144+
else:
145+
raise RuntimeError("Unsupported platform: neither CUDA nor ROCm detected")
113146
device = "cuda"
114147
M, N, K = 8192, 8192, 8192
115148
a = torch.randn(M, K, device=device, dtype=torch.float32).to(dtype)
116149
b = torch.randn(K, N, device=device, dtype=torch.float32).to(dtype).T
117150
c = torch.empty((M, N), device=device, dtype=dtype)
118-
time_ms = triton.testing.do_bench(lambda: cublas.matmul(a, b, c), rep=1000)
151+
time_ms = triton.testing.do_bench(lambda: bench_fn(a, b, c), rep=1000)
119152
return 2 * M * N * K / time_ms * 1e-9
120153

121154

third_party/amd/include/hipblas_instance.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
constexpr int HIPBLAS_COMPUTE_32F_FAST_F8 = 104;
1313
constexpr int HIPBLAS_COMPUTE_32F_FAST_FBF_OCP = 105;
1414

15-
class HipBlasLtInstance {
15+
class HipblasLtInstance {
1616
// Typedefs for hipblas functions
1717
typedef hipblasStatus_t (*hipblasLtCreate_t)(hipblasLtHandle_t *);
1818
typedef hipblasStatus_t (*hipblasLtDestroy_t)(hipblasLtHandle_t);
@@ -264,7 +264,7 @@ class HipBlasLtInstance {
264264
}
265265

266266
public:
267-
HipBlasLtInstance(uint64_t workspace, size_t workspaceSize)
267+
HipblasLtInstance(uint64_t workspace, size_t workspaceSize)
268268
: workspace((void *)workspace), workspaceSize(workspaceSize) {
269269
loadHipBlasDylib();
270270
successOrExit(hipblasLtCreate(&ltHandle));
@@ -273,7 +273,7 @@ class HipBlasLtInstance {
273273
preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
274274
sizeof(workspaceSize)));
275275
}
276-
~HipBlasLtInstance() {
276+
~HipblasLtInstance() {
277277
if (preference)
278278
successOrExit(hipblasLtMatmulPreferenceDestroy(preference));
279279

third_party/amd/python/triton_amd.cc

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -515,15 +515,15 @@ void init_triton_amd(py::module &&m) {
515515
});
516516

517517
auto hipBlas = m.def_submodule("hipblas");
518-
py::class_<HipBlasLtInstance>(hipBlas, "HipBlasLt")
518+
py::class_<HipblasLtInstance>(hipBlas, "HipblasLt")
519519
.def(py::init<>([&](py::object &workspace) {
520520
auto wrk_ptr = workspace.attr("data_ptr")().cast<uint64_t>();
521521
auto wrk_size = workspace.attr("numel")().cast<size_t>() *
522522
workspace.attr("element_size")().cast<size_t>();
523-
return new HipBlasLtInstance(wrk_ptr, wrk_size);
523+
return new HipblasLtInstance(wrk_ptr, wrk_size);
524524
}))
525525
.def("matmul",
526-
[](HipBlasLtInstance &self, py::object &A, py::object &B,
526+
[](HipblasLtInstance &self, py::object &A, py::object &B,
527527
py::object &C) {
528528
auto A_ptr = A.attr("data_ptr")().cast<uint64_t>();
529529
auto B_ptr = B.attr("data_ptr")().cast<uint64_t>();
@@ -532,7 +532,7 @@ void init_triton_amd(py::module &&m) {
532532
self.matmul(init.m, init.n, init.k, A_ptr, B_ptr, C_ptr,
533533
init.dtype);
534534
})
535-
.def("gemm", [](HipBlasLtInstance &self, py::object &A, py::object &B,
535+
.def("gemm", [](HipblasLtInstance &self, py::object &A, py::object &B,
536536
py::object &C, py::object &D, float alpha, float beta) {
537537
auto A_ptr = A.attr("data_ptr")().cast<uint64_t>();
538538
auto B_ptr = B.attr("data_ptr")().cast<uint64_t>();

0 commit comments

Comments (0)