Commit e52429a

Merge commit '0173f7524d8cfc9a5b4b52dec0010eaedef14526'
2 parents 609e327 + 0173f75 commit e52429a

File tree: 19 files changed (+391 -29 lines changed)


.github/workflows/integration-tests-amd.yml

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ jobs:
   integration-tests-amd:
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 45
+    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' }}
     strategy:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}

python/test/unit/language/test_core.py

Lines changed: 2 additions & 0 deletions
@@ -1659,6 +1659,8 @@ def kernel(X, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr):
 @pytest.mark.parametrize("num_ctas", num_ctas_list)
 @pytest.mark.parametrize("dtype_str", ["int32", "int64"])
 def test_atomic_cas(sem, num_ctas, dtype_str, device):
+    if is_hip_cdna2():
+        pytest.skip("Disabled due to being flaky on CDNA2")
     # 1. make sure that atomic_cas changes the original value (Lock)
     @triton.jit
     def change_value(Lock, triton_dtype: tl.constexpr):

python/triton/experimental/gluon/language/amd/gfx1250/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -59,8 +59,8 @@ def wmma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _semantic=None)
         "accumulator tensor's layout must be (16, 16, 128)"

     # TODO: Add more formats
-    assert a_format.value in {"e2m1"}, f"Unsupported lhs_format: {a_format.value}"
-    assert b_format.value in {"e2m1"}, f"Unsupported rhs_format: {b_format.value}"
+    assert a_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported lhs_format: {a_format.value}"
+    assert b_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported rhs_format: {b_format.value}"

     assert a_scale is not None and b_scale is not None, "Scales must not be None"

python/triton_kernels/tests/test_matmul.py

Lines changed: 70 additions & 10 deletions
@@ -197,6 +197,7 @@ class Case:
     x_transpose: bool = False
     w_transpose: bool = False
     y_transpose: bool = False
+    colmajor_mxfp_weight: bool = True


 @pytest.mark.parametrize(
@@ -269,6 +270,7 @@ class Case:
         Case(1000, 704, 800, "batched", "mxfloat8_e4m3fn", "mxfloat4_e2m1", 2, 1),
         Case(1000, 704, 800, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", 8, 2, split_k=9),
         Case(1000, 704, 800, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", 8, 2, split_k=9, hbm_swizzling=True),
+        Case(1000, 704, 800, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", 8, 2, split_k=9, colmajor_mxfp_weight=False),
         Case(1000, 704, 800, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", 8, 2),
         Case(1000, 704, 800, "ragged", "mxfloat8_e4m3fn", "mxfloat4_e2m1", 8, 2, hbm_swizzling=True),
         Case(300, 400, 400, "ragged", "mxfloat8_e4m3fn", "mxfloat8_e4m3fn", 8, 4),
@@ -315,7 +317,7 @@ class Case:
 @pytest.mark.parametrize("has_y_gammas", [False, True])
 @pytest.mark.parametrize("is_persistent", [False, True])
 def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
-            n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile,
+            n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, colmajor_mxfp_weight, epilogue_subtile,
             x_transpose, w_transpose, y_transpose,
             device, opt_flags_scope):
     # TODO: remove when Triton FP8 supports proper RTNE
@@ -463,14 +465,72 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
         w_scale_layout, w_scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(
             mx_axis=mx_axis, num_warps=8)
     # downcast to mxfp
-    w_tri, w_scale_tri = downcast_to_mxfp(w_tri, weight_dtype, axis=mx_axis)
-    w_ref = upcast_from_mxfp(w_tri, w_scale_tri, torch.bfloat16, axis=mx_axis)
-    w_tri_dtype = FP4 if "float4" in weight_dtype_str else weight_dtype
-    w_tri = wrap_torch_tensor(w_tri, w_tri_dtype)
-    w_scale_tri = wrap_torch_tensor(w_scale_tri)
-    # convert layouts
-    w_tri = convert_layout(w_tri, w_layout, **w_layout_opts)
-    w_scale_tri = convert_layout(w_scale_tri, w_scale_layout, **w_scale_layout_opts)
+    w_tri_orig = w_tri
+    if colmajor_mxfp_weight:
+        w_tri, w_scale_tri = downcast_to_mxfp(w_tri, weight_dtype, axis=mx_axis)
+        w_ref = upcast_from_mxfp(w_tri, w_scale_tri, torch.bfloat16, axis=mx_axis)
+        w_tri_dtype = FP4 if "float4" in weight_dtype_str else weight_dtype
+        w_tri = wrap_torch_tensor(w_tri, w_tri_dtype)
+        w_scale_tri = wrap_torch_tensor(w_scale_tri)
+        # convert layouts
+        w_tri = convert_layout(w_tri, w_layout, **w_layout_opts)
+        w_scale_tri = convert_layout(w_scale_tri, w_scale_layout, **w_scale_layout_opts)
+    else:
+        if torch.cuda.get_device_capability()[0] < 10:
+            pytest.skip("transposed mxfp weight not supported with cuda capability < 10")
+        if block_m == 16:
+            pytest.skip("PassManager::run failed from Triton compiler")
+        # TODO: swizzling for rowmajor
+
+        # A typical use case is we already quantized col-major weight,
+        # and we want matmul with its transposed row-major weight w/o
+        # requantization.
+
+        # put abs_max of each 32x32 block to diagonal so scales of transposed agree
+        w_ndim = w_tri.ndim
+        if w_ndim == 2:
+            w_tri = w_tri.unsqueeze(0)
+        BLOCK_SIZE = int(MXFP_BLOCK_SIZE)
+        for e, i, j in itertools.product(range(w_tri.shape[0]), range(0, w_tri.shape[1], BLOCK_SIZE), range(0, w_tri.shape[2], BLOCK_SIZE)):
+            i_end = min(i + BLOCK_SIZE, w_tri.shape[1])
+            j_end = min(j + BLOCK_SIZE, w_tri.shape[2])
+            block = w_tri[e, i:i_end, j:j_end]
+            m_abs = block.abs().max()
+            i_len = i_end - i
+            j_len = j_end - j
+            min_len = min(i_len, j_len)
+            signs = torch.randint(0, 2, (max(i_len, j_len),), device=w_tri.device) * 2 - 1
+            block.diagonal(dim1=-2, dim2=-1)[:] = signs[:min_len] * m_abs
+            if j_len > i_len:
+                block[i_len - 1, i_len:] = signs[min_len:] * m_abs
+            elif i_len > j_len:
+                block[j_len:, j_len - 1] = signs[min_len:] * m_abs
+        if w_ndim == 2:
+            w_tri = w_tri.squeeze(0)
+
+        # matmul with rowmajor weight expects scale is separately
+        # constructed (not much additional memory needed).
+        _, w_scale_tri = downcast_to_mxfp(w_tri, weight_dtype, axis=mx_axis)
+        # reuse quantized value from colmajor
+        w_tri_rowmajor, w_scale_tri_rowmajor = downcast_to_mxfp(w_tri.mT.contiguous(), weight_dtype, axis=mx_axis)
+        w_ref = upcast_from_mxfp(w_tri_rowmajor, w_scale_tri_rowmajor, torch.bfloat16, axis=mx_axis).mT.contiguous()
+        w_tri = w_tri_rowmajor.data.mT
+
+        def _pad_and_block(x: torch.Tensor) -> torch.Tensor:
+            x = torch.nn.functional.pad(x, (0, x.shape[-1] % BLOCK_SIZE), mode="replicate")
+            return x.view(*x.shape[:-1], x.shape[-1] // BLOCK_SIZE, BLOCK_SIZE)

+        # check if generated scale is transpose-invariant as intended construction
+        # [cdiv(K, 32), N] -> dedup to [cdiv(K, 32), cdiv(N, 32)]
+        w_scale_tri_blocked = _pad_and_block(w_scale_tri)
+        w_scale_tri_sampled = w_scale_tri_blocked[..., 0:1]
+        # [cdiv(N, 32), K] -> dedup to [cdiv(N, 32), cdiv(K, 32)]
+        w_scale_tri_rowmajor_blocked = _pad_and_block(w_scale_tri_rowmajor)
+        w_scale_tri_rowmajor_sampled = w_scale_tri_rowmajor_blocked[..., 0:1]
+        assert torch.equal(w_scale_tri_sampled.expand_as(w_scale_tri_blocked), w_scale_tri_blocked)
+        assert torch.equal(w_scale_tri_rowmajor_sampled.expand_as(w_scale_tri_rowmajor_blocked), w_scale_tri_rowmajor_blocked)
+        assert torch.equal(w_scale_tri_sampled.squeeze(-1), w_scale_tri_rowmajor_sampled.squeeze(-1).mT)
+
     precision_opt.weight_scale = w_scale_tri
     epilogue = None
     if act_mxfp8:
@@ -479,7 +539,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
     is_input_batched = x_tri.ndim == 3
     y_shape = x_tri.shape if is_input_batched else (1,) + x_tri.shape
     n_rows = y_shape[1] if gindx is None or mode == "batched" else gindx.dst_indx.shape[0]
-    y_shape = (y_shape[0], n_rows, w_tri.shape[-1])
+    y_shape = (y_shape[0], n_rows, w_tri_orig.shape[-1])
     if sindx is None or mode == "batched":
         if not is_input_batched:
             y_shape = (y_shape[1], y_shape[2])
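
Note on the diagonal trick above: an MXFP scale for a 32-element group depends only on that group's absolute maximum, so if every row and every column of a 32x32 block contains the block-wide abs-max, quantizing along K (col-major) and along N (transposed row-major) yields the same per-block scale. A minimal sketch assuming only PyTorch; group_scale below is a simplified illustrative stand-in, not the library's downcast_to_mxfp:

import torch

torch.manual_seed(0)
BLOCK = 32
w = torch.randn(BLOCK, BLOCK)
m_abs = w.abs().max()
signs = torch.randint(0, 2, (BLOCK,)) * 2 - 1
w.diagonal()[:] = signs * m_abs  # every row and every column now holds +/- the block abs-max

def group_scale(x, axis):
    # simplified shared-exponent scale for a 32-wide group along `axis`
    return x.abs().amax(dim=axis).log2().floor()

# per-column and per-row scales agree, which is what the test asserts
assert torch.equal(group_scale(w, axis=0), group_scale(w, axis=1))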

python/triton_kernels/tests/test_mxfp.py

Lines changed: 16 additions & 0 deletions
@@ -45,6 +45,22 @@ def test_mxfp4_rounding_cases(dst_dtype, device):
     assert_equal(dequant_torch, dequant)


+@pytest.mark.parametrize("src_dtype", ["float4_e2m1", "float8_e5m2", "float8_e4m3fn"])
+@pytest.mark.parametrize("dst_dtype", ["float16", "bfloat16", "float32"])
+def test_mxfp_extreme_values(src_dtype, dst_dtype, device):
+    if "float8" in src_dtype and (is_cuda() and torch.cuda.get_device_capability()[0] < 9):
+        pytest.skip("Float8 not tested on A100")
+    src_dtype = dtype_str_to_torch(src_dtype)
+    dst_dtype = dtype_str_to_torch(dst_dtype)
+    BIG_VALUE = 65470 if dst_dtype == torch.float16 else 3.3895e38
+    x = torch.tensor([BIG_VALUE, BIG_VALUE], dtype=dst_dtype, device=device)
+    xq_value, xq_scale = downcast_to_mxfp(x, src_dtype, axis=-1)
+    xdq = upcast_from_mxfp(xq_value, xq_scale, dst_dtype, axis=-1)
+    xdq_ref = upcast_from_mxfp_torch(xq_value, xq_scale, dst_dtype, axis=-1)
+    assert_equal(xdq_ref, xdq)
+    assert not xdq.isinf().any()
+
+
 @pytest.mark.parametrize("src_dtype", ["float4_e2m1", "float8_e5m2", "float8_e4m3fn"])
 @pytest.mark.parametrize("dst_dtype", ["float16", "bfloat16", "float32"])
 def test_mxfp_quant_dequant(src_dtype, dst_dtype, device):

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 3 additions & 1 deletion
@@ -17,6 +17,7 @@
 from .matmul_ogs_details._p_matmul_ogs import _p_matmul_ogs, get_per_device_per_stream_alloc_fn
 from .matmul_ogs_details._reduce_grouped import _reduce_grouped
 from .numerics_details.mxfp import MXFP_BLOCK_SIZE
+from .tensor_details.layout_details.strided import StridedLayout
 from .matmul_ogs_details.opt_flags import make_opt_flags, update_opt_flags_constraints, InapplicableConstraint
 from .specialize import specialize
 from .tensor import Storage, Tensor, FP4, bitwidth, wrap_torch_tensor
@@ -441,12 +442,13 @@ def matmul_ogs(x, w, bias,
     w_scale = precision_config.weight_scale
     w_has_mx = w_scale is not None
     is_hopper_fp8 = is_cuda() and not target_info.cuda_capability_geq(10, 0) and bitwidth(w.dtype) == 8
-    if w_has_mx: assert w.stride(-2) == 1, "`w` must be column-major when it has data-type mxfp"
     if is_hopper_fp8: assert w.stride(-2) == 1, "`w` must be column-major when it has data-type FP8 on capability < 10"
     if not isinstance(w, Tensor):
         # TODO: remove this code path; using uint8 for mxfp4 weight will bite us when we want to support uint8 for real
         dtype = FP4 if w.dtype == torch.uint8 else w.dtype
         w = wrap_torch_tensor(w, dtype=dtype)
+    if w_has_mx and (torch.cuda.get_device_capability()[0] < 10 or w.storage.layout is not None and not isinstance(w.storage.layout, StridedLayout)):
+        assert w.stride(-2) == 1, "`w` must be column-major when it has data-type mxfp and (swizzled or not on >=Blackwell)"
     if w_scale is not None and not isinstance(w_scale, Tensor):
         w_scale = Tensor(w_scale)
     if w_scale is not None:
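
Note: the relaxed check above only requires column-major ("K-major") mxfp weights when the device is pre-Blackwell (capability < 10) or the weight storage uses a non-strided (swizzled) layout; otherwise a row-major strided weight is now accepted. A minimal sketch of the stride test it relies on, assuming plain PyTorch only:

import torch

w_rowmajor = torch.empty(128, 256, dtype=torch.uint8)     # stride(-2) == 256
w_colmajor = torch.empty(256, 128, dtype=torch.uint8).mT  # transposed view, stride(-2) == 1
assert w_rowmajor.stride(-2) != 1
assert w_colmajor.stride(-2) == 1  # this is what the column-major assert checks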

python/triton_kernels/triton_kernels/numerics_details/mxfp.py

Lines changed: 11 additions & 0 deletions
@@ -297,6 +297,17 @@ def upcast_from_mxfp_torch(tensor: torch.Tensor, scale: torch.Tensor, target_dty
     padded_tensor = padded_tensor.view(*new_shape)
     dq_scale_padded = dq_scale.unsqueeze(-1)  # shape: [..., ceil(axis_shape/32), 1]
     out_padded = padded_tensor * dq_scale_padded
+    # Need to clamp since due to rounding, we can have overflow that was within
+    # the range before quantization.
+    # e.g., 3.3895e+38 -> log2(3.3895e+38 / max_fp8e4m3=448) ~= 119.17 -> round
+    # up to 120 + exp_bias=127 -> scale=247
+    # 3.3895e+38 / 2**120 ~= 254.9976 -> round to 256 in fp8e4m3fn
+    # Dequantization: 256 * 2**120 > 3.4e38 overflowing 3.38953139e38
+    finfo = torch.finfo(target_dtype)
+    out_padded = (padded_tensor * dq_scale_padded).clamp(finfo.min, finfo.max)
+    if tensor.dtype == torch.float8_e5m2:
+        # fp8e5m2 can have inf and we want to preserve so separately handle
+        out_padded = out_padded.where(~padded_tensor.isinf(), padded_tensor.to(target_dtype))

     # Flatten back and remove the padded tail
     out_padded = out_padded.view(*fp32_tensor.shape[:-1], new_axis_shape)
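
The overflow arithmetic in the new comment can be reproduced directly. A standalone sketch assuming only PyTorch and the standard format constants (448 is the e4m3fn finite max; the shared exponent follows the comment's "round up"); it is illustrative, not the library's downcast path:

import math
import torch

x = 3.3895e38                                         # close to the bfloat16 finite max
max_e4m3 = 448.0
exp = math.ceil(math.log2(x / max_e4m3))              # -> 120 (shared exponent)
q = torch.tensor(x / 2**exp).to(torch.float8_e4m3fn)  # 254.99... rounds up to 256
deq = q.double() * 2.0**exp                           # 256 * 2**120 = 2**128 ~ 3.4028e38
print(float(deq) > torch.finfo(torch.bfloat16).max)   # True: exceeds the bf16 finite max
print(deq.to(torch.bfloat16))                         # inf without the clamp added above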

python/triton_kernels/triton_kernels/numerics_details/mxfp_details/_upcast_from_mxfp.py

Lines changed: 10 additions & 0 deletions
@@ -119,6 +119,16 @@ def _upcast_from_mxfp(out_ptr, stride_o_outer, stride_o_quant: tl.constexpr, mx_
     scale = scale.reshape(dst_scale.shape)

     out_tensor = dst_tensor * dst_scale
+    if dst_dtype == tl.float32:
+        max_fin = 3.4028234663852886e+38
+    elif dst_dtype == tl.bfloat16:
+        max_fin = 3.3895313892515355e+38
+    else:
+        tl.static_assert(dst_dtype == tl.float16)
+        max_fin = 65504
+    # TODO: handle infinity same as upcast_from_mxfp_torch together with the
+    # above FIXME
+    out_tensor = tl.clamp(out_tensor, min=-max_fin, max=max_fin)
     # Correct any NaNs encoded via the scale.
     out_tensor = tl.where(scale == 0xFF, float("nan"), out_tensor)
     out_tensor = out_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM])
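
The hard-coded max-finite constants above can be cross-checked against torch.finfo on the host side; a quick sanity sketch, assuming plain PyTorch:

import torch

assert torch.finfo(torch.float32).max == 3.4028234663852886e+38
assert torch.finfo(torch.bfloat16).max == 3.3895313892515355e+38
assert torch.finfo(torch.float16).max == 65504.0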

test/Conversion/amd/async_ops_to_llvm.mlir

Lines changed: 1 addition & 1 deletion
@@ -138,7 +138,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
   tt.func public @async_commit_group(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
                                      %arg1: i32 {tt.divisibility = 16 : i32},
                                      %arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
-    // CHECK-NEXT: llvm.mlir.constant(0 : i32) : i32
+    // CHECK: llvm.mlir.constant(0 : i32) : i32
     // CHECK-NEXT: llvm.return
     ttg.async_commit_group
     tt.return

test/Conversion/amd/ds_transpose.mlir

Lines changed: 23 additions & 0 deletions
@@ -5,6 +5,8 @@
 #mma32_scaled = #ttg.amd_mfma<{version = 4, warpsPerCTA = [2, 2], instrShape = [32, 32, 64], isTransposed = true}>
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
 #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#padding = #ttg.padded_shared<[512:+16] {order = [0, 1], shape = [128, 64]}>
+#padding_vec1 = #ttg.padded_shared<[1:+4] {order = [0, 1], shape = [128, 64]}>
 #smem = #ttg.shared_memory

 #linear_ds_tr_tile_out = #ttg.linear<{register = [[0, 1], [0, 2], [0, 8], [0, 4]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [32, 0], [16, 0]], warp = [[0, 0], [0, 0]], block = []}>
@@ -688,4 +690,25 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.store %ptr1, %a1 : tensor<64x16x!tt.ptr<bf16>, #linear_ds_tr_tile_invalid>
     tt.return
   }
+
+  // CHECK-LABEL: ds_transpose_with_padding
+  tt.func @ds_transpose_with_padding(%arg0: !ttg.memdesc<128x64xf16, #padding, #smem, mutable>, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) {
+    // CHECK-COUNT-16: rocdl.ds.read.tr16.b64 %{{.*}} : <3> -> vector<4xf16>
+    // CHECK-NOT: rocdl.ds.read.tr16.b64
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xf16, #padding, #smem, mutable> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 8}>>
+
+    %ptr1 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<128x64x!tt.ptr<f16>, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 8}>>
+    tt.store %ptr1, %1 : tensor<128x64x!tt.ptr<f16>, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 8}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_padding_interval_too_small
+  tt.func @ds_transpose_padding_interval_too_small(%arg0: !ttg.memdesc<128x64xf16, #padding_vec1, #smem, mutable>, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) {
+    // CHECK-NOT: rocdl.ds.read.tr16.b64
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xf16, #padding_vec1, #smem, mutable> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 8}>>
+
+    %ptr1 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<128x64x!tt.ptr<f16>, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 8}>>
+    tt.store %ptr1, %1 : tensor<128x64x!tt.ptr<f16>, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 8}>>
+    tt.return
+  }
 }
