[AMD][BACKEND] Switch to code object v5 (#5005)

AlexAUT · web-flow · commit 91302ea36ddc · 2024-12-12T10:04:37.000-08:00
Switches to code object v5, which requires bumping ROCm to 6.2+ to avoid segfaults with `tl.device_print` and `tl.num_programs`. The added unit test covers the previous segfault with `tl.device_print` on 2D tensors. This is preparation for testing llvm/llvm-project@c4d8920.
diff --git a/README.md b/README.md
@@ -253,5 +253,5 @@ Supported Platforms:
 Supported Hardware:
 
 - NVIDIA GPUs (Compute Capability 8.0+)
-- AMD GPUs (ROCm 5.2+)
+- AMD GPUs (ROCm 6.2+)
 - Under development: CPUs
diff --git a/python/test/unit/language/print_helper.py b/python/test/unit/language/print_helper.py
@@ -90,9 +90,17 @@ def kernel_print_pointer(X, Y, BLOCK: tl.constexpr):
     tl.device_print("ptr ", X + tl.arange(0, BLOCK))
 
 
+@triton.jit
+def kernel_print_2d_tensor(X, Y, BLOCK_SIZE_X: tl.constexpr, BLOCK_SIZE_Y: tl.constexpr):
+    off_x = tl.arange(0, BLOCK_SIZE_X)
+    off_y = tl.arange(0, BLOCK_SIZE_Y)
+    x = tl.load(X + off_x[:, None] * BLOCK_SIZE_Y + off_y[None, :])
+    tl.device_print("", x)
+
+
 def test_print(func: str, data_type: str, device: str):
     N = 128  # This value should match with test_print in test_subprocess.py.
-    # TODO(antiagainst): Currently the warp count is chosen to make sure wedon't have multiple
+    # TODO(antiagainst): Currently the warp count is chosen to make sure we don't have multiple
     # threads printing duplicated messages due to broadcasting. Improve print op lowering logic
     # to filter out duplicated data range.
     num_warps = N // get_current_target_warp_size()
@@ -128,12 +136,18 @@ def test_print(func: str, data_type: str, device: str):
         kernel_device_print_hex[(1, )](x, y, num_warps=num_warps, BLOCK=N)
     elif func == "device_print_pointer":
         kernel_print_pointer[(1, )](x, y, num_warps=num_warps, BLOCK=N)
+    elif func == "device_print_2d_tensor":
+        BLOCK_SIZE_X = num_warps
+        BLOCK_SIZE_Y = get_current_target_warp_size()
+        x_2d_tensor = x.reshape((BLOCK_SIZE_X, BLOCK_SIZE_Y))
+        kernel_print_2d_tensor[(1, )](x_2d_tensor, y, num_warps=num_warps, BLOCK_SIZE_X=BLOCK_SIZE_X,
+                                      BLOCK_SIZE_Y=BLOCK_SIZE_Y)
     else:
         assert f"Unknown kernel: {func}"
 
     if func != "print_no_arg" and func != "no_arg_print" and func != "device_print_large" and \
        func != "print_multiple_args" and func != "device_print_multiple_args" and \
-       func != "device_print_pointer" and func != "device_print_scalar":
+       func != "device_print_pointer" and func != "device_print_scalar" and func != "device_print_2d_tensor":
         assert_close(y, x)
 
     # Wait until driver complete all the jobs for the device_print, especially test_subprocess
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
@@ -2525,21 +2525,18 @@ def histogram_kernel(x_ptr, z_ptr, M: tl.constexpr, N: tl.constexpr):
 def test_optimize_thread_locality(op, BLOCK_N, N, num_pid_n, device):
 
     @triton.jit
-    def kernel(X, Y, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, NUM_PID_N: tl.constexpr):
+    def kernel(X, Y, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
         start_m = tl.program_id(0)
         pid_n = tl.program_id(1)
+        num_pid_n = tl.num_programs(1)
         local = INITIALIZE_PATCH
         off_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-        for start_n in range(pid_n, tl.cdiv(N, BLOCK_N), NUM_PID_N):
+        for start_n in range(pid_n, tl.cdiv(N, BLOCK_N), num_pid_n):
             off_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
             Xs = X + off_m[:, None] * N + off_n[None, :]
             x = tl.load(Xs)
             local = ACCUMULATE_PATCH
-        tl.store(Y + off_m * NUM_PID_N + pid_n, local)
-        # the following segfaults AMD backend following #3492
-        # really unclear why; the llvm-ir and kernel arguments are
-        # identical !
-        # tl.store(Y + off_m * tl.num_programs(1) + pid_n, local)
+        tl.store(Y + off_m * num_pid_n + pid_n, local)
 
     initialize_patch = {
         'sum': 'tl.zeros([BLOCK_M], dtype=tl.float32)',
@@ -2561,7 +2558,7 @@ def kernel(X, Y, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, NUM_PID_N: tl.
     BLOCK_M = 32
     x = torch.randn((BLOCK_M, N), dtype=torch.float32, device=device)
     y = torch.randn((BLOCK_M, num_pid_n), dtype=torch.float32, device=device)
-    h = kernel[(1, num_pid_n, 1)](x, y, N, BLOCK_M, BLOCK_N, NUM_PID_N=num_pid_n)
+    h = kernel[(1, num_pid_n, 1)](x, y, N, BLOCK_M, BLOCK_N)
     if not is_interpreter():
         assert h.asm['ttgir'].count(
             '"tt.reduce"') == 2, "tt.reduce should be called twice, otherwise the optimization didn't work"
diff --git a/python/test/unit/language/test_subprocess.py b/python/test/unit/language/test_subprocess.py
@@ -4,6 +4,8 @@
 import sys
 from collections import Counter
 
+import triton
+
 import pytest
 
 dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -35,6 +37,7 @@ def is_interpreter():
                                                       ("device_print_pointer", "int32"),
                                                       ("device_print_negative", "int32"),
                                                       ("device_print_uint", "uint32"),
+                                                      ("device_print_2d_tensor", "int32"),
                                                   ])
 def test_print(func_type: str, data_type: str, device: str):
     proc = subprocess.run(
@@ -101,6 +104,13 @@ def test_print(func_type: str, data_type: str, device: str):
     elif func_type == "device_print_pointer":
         for i in range(N):
             expected_lines[f"pid (0, 0, 0) idx ({i:3}) ptr: 0x"] = 1
+    elif func_type == "device_print_2d_tensor":
+        warp_size = triton.runtime.driver.active.get_current_target().warp_size
+        x_dim = N // warp_size
+        y_dim = warp_size
+        for x in range(x_dim):
+            for y in range(y_dim):
+                expected_lines[f"pid (0, 0, 0) idx ({x}, {y:2}): {(x * y_dim + y)}"] = 1
 
     actual_lines = Counter()
     for line in outs:
diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py
@@ -321,7 +321,7 @@ def make_llir(src, metadata, options):
         # Set various control constants on the LLVM module so that device
         # libraries can resolve references to them.
         amd.set_isa_version(llvm_mod, options.arch)
-        amd.set_abi_version(llvm_mod, 400)
+        amd.set_abi_version(llvm_mod, 500)
         amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
         amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
         amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)