Skip to content

Commit 3530aab

Browse files
authored
[AMD][gfx1250] Enable mixed precision (scaled) dot support in Triton (#8938)
This PR enables `test_scaled_dot@test_core.py`. The general lowering steps are the same as on gfx9: the operand with mxfp8 or mxfp4 type is upcast to the same [b]f16 type as the other operand, using the new gfx1250 `cvt.scale.pk8.*` instructions. Once both operand types are aligned, the existing code generation for the 16x16x32 [b]f16 WMMA instruction can be reused. Since the data layouts of the input and scale tensors differ on gfx1250 from gfx9, most of the changes in this PR are about: - preparing the layout for the new cvt.scale.pk8 instructions; - selecting the op_sel value corresponding to the layout created in the step above. More details are provided as inline comments in the code changes. As for testing: relevant lit tests are added. Runtime test: pytest -s -v python/test/unit/language/test_core.py::test_scaled_dot.
1 parent 1c15a29 commit 3530aab

File tree

10 files changed

+452
-57
lines changed

10 files changed

+452
-57
lines changed

test/TritonGPU/amd/accelerate-amd-matmul-wmma-gfx1250.mlir

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,136 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
194194
tt.return
195195
}
196196
}
197+
198+
// -----
199+
200+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
201+
#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
202+
#blocked4 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
203+
// CHECK{LITERAL}: #linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16]], lane = [[0, 32], [0, 64], [1, 0], [2, 0], [4, 0]], warp = [[8, 0], [16, 0]], block = []}>
204+
// CHECK-LABEL: wmma_dot_scaled_mxfp8_bf16
205+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} {
206+
tt.func public @wmma_dot_scaled_mxfp8_bf16(
207+
%arg0: tensor<32x128x!tt.ptr<f8E4M3FN>, #blocked4>,
208+
%arg1: tensor<32x4x!tt.ptr<i8>, #blocked2>,
209+
%arg2: tensor<128x32x!tt.ptr<bf16>, #blocked>,
210+
%output: tensor<32x32x!tt.ptr<f32>, #blocked>
211+
) {
212+
// CHECK: tt.load %arg1 {amdg.decomposed_dot_scaled_source = true} : tensor<32x4x!tt.ptr<i8>, #blocked1>
213+
// CHECK: %[[SCALE:.*]] = tt.reshape {{.*}} : tensor<32x4x32xi8, #blocked3> -> tensor<32x128xi8, #linear>
214+
// CHECK: %[[CVT0:.*]] = ttg.convert_layout %[[SCALE]] : tensor<32x128xi8, #linear> -> tensor<32x128xi8, #blocked>
215+
// CHECK: %[[UPCASTED:.*]] = amdg.scaled_upcast_fp8 {{.*}} scale %[[CVT0]] : tensor<32x128xf8E4M3FN, #blocked>, tensor<32x128xi8, #blocked> -> tensor<32x128xbf16, #blocked>
216+
// CHECK: %[[SEL:.*]] = arith.select {{.*}}, {{.*}}, %[[UPCASTED]]
217+
// CHECK: %[[CVT1:.*]] = ttg.convert_layout %[[SEL]] : tensor<32x128xbf16, #blocked> -> tensor<32x128xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
218+
// CHECK: %[[OPND0:.*]] = ttg.convert_layout %[[CVT1]] : tensor<32x128xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<32x128xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
219+
// CHECK: tt.dot %[[OPND0]]
220+
%a = tt.load %arg0 : tensor<32x128x!tt.ptr<f8E4M3FN>, #blocked4>
221+
%scale = tt.load %arg1 : tensor<32x4x!tt.ptr<i8>, #blocked2>
222+
%b = tt.load %arg2 : tensor<128x32x!tt.ptr<bf16>, #blocked>
223+
%c = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked>
224+
%res = tt.dot_scaled %a scale %scale, %b, %c lhs = e4m3 rhs = bf16 {fastMath = false} : tensor<32x128xf8E4M3FN, #blocked4>, tensor<32x4xi8, #blocked2> * tensor<128x32xbf16, #blocked> -> tensor<32x32xf32, #blocked>
225+
226+
tt.store %output, %res : tensor<32x32x!tt.ptr<f32>, #blocked>
227+
tt.return
228+
}
229+
}
230+
231+
// -----
232+
233+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
234+
#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
235+
#blocked4 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
236+
// CHECK{LITERAL}: #linear = #ttg.linear<{register = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], lane = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16]], warp = [[32, 0], [64, 0]], block = []}>
237+
// CHECK-LABEL: wmma_dot_scaled_f16_mxfp8
238+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} {
239+
tt.func public @wmma_dot_scaled_f16_mxfp8(
240+
%arg0: tensor<32x128x!tt.ptr<f16>, #blocked4>,
241+
%arg1: tensor<32x4x!tt.ptr<i8>, #blocked2>,
242+
%arg2: tensor<128x32x!tt.ptr<f8E5M2>, #blocked>,
243+
%output: tensor<32x32x!tt.ptr<f32>, #blocked>
244+
) {
245+
// CHECK: %[[TRANS:.*]] = tt.trans {{.*}} {order = array<i32: 0, 2, 1>} : tensor<4x32x32xi8, #blocked4> -> tensor<4x32x32xi8, #blocked5>
246+
// CHECK: %[[SCALE:.*]] = tt.reshape %[[TRANS]] : tensor<4x32x32xi8, #blocked5> -> tensor<128x32xi8, #linear>
247+
// CHECK: %[[CVT0:.*]] = ttg.convert_layout %[[SCALE]] : tensor<128x32xi8, #linear> -> tensor<128x32xi8, #blocked2>
248+
// CHECK: %[[UPCASTED:.*]] = amdg.scaled_upcast_fp8 {{.*}} scale %[[CVT0]] : tensor<128x32xf8E5M2, #blocked2>, tensor<128x32xi8, #blocked2> -> tensor<128x32xf16, #blocked2>
249+
// CHECK: %[[SEL:.*]] = arith.select {{.*}}, %cst, %[[UPCASTED]] : tensor<128x32xi1, #blocked2>, tensor<128x32xf16, #blocked2>
250+
// CHECK: %[[CVT1:.*]] = ttg.convert_layout %[[SEL]] : tensor<128x32xf16, #blocked2> -> tensor<128x32xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked2}>>
251+
// CHECK: %[[OPND1:.*]] = ttg.convert_layout %[[CVT1]] : tensor<128x32xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked2}>> -> tensor<128x32xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
252+
// CHECK: = tt.dot {{.*}}, %[[OPND1]]
253+
%a = tt.load %arg0 : tensor<32x128x!tt.ptr<f16>, #blocked4>
254+
%scale = tt.load %arg1 : tensor<32x4x!tt.ptr<i8>, #blocked2>
255+
%b = tt.load %arg2 : tensor<128x32x!tt.ptr<f8E5M2>, #blocked>
256+
%c = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked>
257+
%res = tt.dot_scaled %a, %b scale %scale, %c lhs = fp16 rhs = e5m2 {fastMath = false} : tensor<32x128xf16, #blocked4> * tensor<128x32xf8E5M2, #blocked>, tensor<32x4xi8, #blocked2> -> tensor<32x32xf32, #blocked>
258+
259+
tt.store %output, %res : tensor<32x32x!tt.ptr<f32>, #blocked>
260+
tt.return
261+
}
262+
}
263+
264+
// -----
265+
266+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
267+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
268+
#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
269+
#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
270+
// CHECK{LITERAL}: #linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16]], lane = [[0, 32], [1, 0], [2, 0], [4, 0], [8, 0]], warp = [[0, 0], [0, 0]], block = []}>
271+
// CHECK-LABEL: wmma_dot_scaled_mxfp4_bf16
272+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} {
273+
tt.func public @wmma_dot_scaled_mxfp4_bf16(
274+
%arg0: tensor<16x32x!tt.ptr<i8>, #blocked5>,
275+
%arg1: tensor<16x2x!tt.ptr<i8>, #blocked2>,
276+
%arg2: tensor<64x16x!tt.ptr<bf16>, #blocked>,
277+
%output: tensor<16x16x!tt.ptr<f32>, #blocked>
278+
) {
279+
// CHECK: tt.load %arg1 {amdg.decomposed_dot_scaled_source = true} : tensor<16x2x!tt.ptr<i8>, #blocked1>
280+
// CHECK: %[[SCALE:.*]] = tt.reshape {{.*}} : tensor<16x2x32xi8, #blocked3> -> tensor<16x64xi8, #linear>
281+
// CHECK: %[[CVT0:.*]] = ttg.convert_layout %[[SCALE]] : tensor<16x64xi8, #linear> -> tensor<16x64xi8, #blocked>
282+
// CHECK: %[[UPCASTED:.*]] = amdg.scaled_upcast_fp4 {{.*}} scale %[[CVT0]] {axis = 1 : i32} : tensor<16x32xi8, #blocked>, tensor<16x64xi8, #blocked> -> tensor<16x64xbf16, #blocked>
283+
// CHECK: %[[SEL:.*]] = arith.select {{.*}}, %{{.*}}, %[[UPCASTED]] : tensor<16x64xi1, #blocked>, tensor<16x64xbf16, #blocked>
284+
// CHECK: %[[CVT1:.*]] = ttg.convert_layout %[[SEL]] : tensor<16x64xbf16, #blocked> -> tensor<16x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
285+
// CHECK: %[[OPND0:.*]] = ttg.convert_layout %[[CVT1]] : tensor<16x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<16x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
286+
// CHECK: tt.dot %[[OPND0]]
287+
%a = tt.load %arg0 : tensor<16x32x!tt.ptr<i8>, #blocked5>
288+
%scale = tt.load %arg1 : tensor<16x2x!tt.ptr<i8>, #blocked2>
289+
%b = tt.load %arg2 : tensor<64x16x!tt.ptr<bf16>, #blocked>
290+
%c = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #blocked>
291+
%res = tt.dot_scaled %a scale %scale, %b, %c lhs = e2m1 rhs = bf16 {fastMath = false} : tensor<16x32xi8, #blocked5>, tensor<16x2xi8, #blocked2> * tensor<64x16xbf16, #blocked> -> tensor<16x16xf32, #blocked>
292+
293+
tt.store %output, %res : tensor<16x16x!tt.ptr<f32>, #blocked>
294+
tt.return
295+
}
296+
}
297+
298+
// -----
299+
300+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
301+
#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
302+
#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
303+
// CHECK{LITERAL}: #linear = #ttg.linear<{register = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], lane = [[0, 1], [0, 2], [0, 4], [0, 8], [32, 0]], warp = [[0, 0], [0, 0]], block = []}>
304+
// CHECK-LABEL: wmma_dot_scaled_fp16_mxfp4
305+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} {
306+
tt.func public @wmma_dot_scaled_fp16_mxfp4(
307+
%arg0: tensor<16x64x!tt.ptr<f16>, #blocked5>,
308+
%arg1: tensor<16x2x!tt.ptr<i8>, #blocked2>,
309+
%arg2: tensor<32x16x!tt.ptr<i8>, #blocked>,
310+
%output: tensor<16x16x!tt.ptr<f32>, #blocked>
311+
) {
312+
// CHECK: tt.load %arg1 {amdg.decomposed_dot_scaled_source = true} : tensor<16x2x!tt.ptr<i8>, #blocked1>
313+
// CHECK: %[[SCALE:.*]] = tt.reshape {{.*}} : tensor<2x32x16xi8, #blocked5> -> tensor<64x16xi8, #linear>
314+
// CHECK: %[[CVT0:.*]] = ttg.convert_layout %[[SCALE]] : tensor<64x16xi8, #linear> -> tensor<64x16xi8, #blocked2>
315+
// CHECK: %[[UPCASTED:.*]] = amdg.scaled_upcast_fp4 {{.*}} scale %[[CVT0]] {axis = 0 : i32} : tensor<32x16xi8, #blocked2>, tensor<64x16xi8, #blocked2> -> tensor<64x16xf16, #blocked2>
316+
// CHECK: %[[SEL:.*]] = arith.select {{.*}}, %cst, %[[UPCASTED]] : tensor<64x16xi1, #blocked2>, tensor<64x16xf16, #blocked2>
317+
// CHECK: %[[CVT1:.*]] = ttg.convert_layout %[[SEL]] : tensor<64x16xf16, #blocked2> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked2}>>
318+
// CHECK: %[[OPND1:.*]] = ttg.convert_layout %[[CVT1]] : tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked2}>> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
319+
// CHECK: tt.dot {{.*}}, %[[OPND1]]
320+
%a = tt.load %arg0 : tensor<16x64x!tt.ptr<f16>, #blocked5>
321+
%scale = tt.load %arg1 : tensor<16x2x!tt.ptr<i8>, #blocked2>
322+
%b = tt.load %arg2 : tensor<32x16x!tt.ptr<i8>, #blocked>
323+
%c = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #blocked>
324+
%res = tt.dot_scaled %a, %b scale %scale, %c lhs = fp16 rhs = e2m1 {fastMath = false} : tensor<16x64xf16, #blocked5> * tensor<32x16xi8, #blocked>, tensor<16x2xi8, #blocked2> -> tensor<16x16xf32, #blocked>
325+
326+
tt.store %output, %res : tensor<16x16x!tt.ptr<f32>, #blocked>
327+
tt.return
328+
}
329+
}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// RUN: triton-opt %s -split-input-file --allocate-amdgpu-shared-memory --convert-triton-amdgpu-to-llvm="arch=gfx1250" --canonicalize --cse | FileCheck %s
2+
3+
// -----
4+
5+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
6+
#mma = #ttg.amd_wmma<{version = 3, isTranspose = true, warpsPerCTA = [2, 2], instrShape = [16, 16, 32]}>
7+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} {
8+
tt.func public @wmma_dot_scaled_mxfp8_bf16(%arg0: tensor<32x128xf8E4M3FN, #blocked>, %arg1: tensor<32x128xi8, #blocked>, %arg2: tensor<32x128x!tt.ptr<bf16>, #blocked>) {
9+
// CHECK: %[[SCALE:.*]] = llvm.extractvalue %arg1[0] : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>
10+
// CHECK: %[[SCALE_1:.*]] = llvm.extractvalue %arg1[8] : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>
11+
// CHECK: %[[SCALE_2:.*]] = llvm.extractvalue %arg1[16] : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>
12+
// CHECK: %[[SCALE_3:.*]] = llvm.extractvalue %arg1[24] : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>
13+
14+
// CHECK: llvm.insertelement %[[SCALE]], {{.*}} : vector<4xi8>
15+
// CHECK: llvm.insertelement %[[SCALE]], {{.*}} : vector<4xi8>
16+
// CHECK: llvm.insertelement %[[SCALE]], {{.*}} : vector<4xi8>
17+
// CHECK: %[[V0:.*]] = llvm.insertelement %[[SCALE]], {{.*}} : vector<4xi8>
18+
// CHECK: %[[SCALE_INT32:.*]] = llvm.bitcast %[[V0]] : vector<4xi8> to i32
19+
// CHECK: rocdl.cvt.scale.pk8.bf16.fp8 {{.*}}, %[[SCALE_INT32]][0] : vector<8xbf16>
20+
21+
// CHECK: llvm.insertelement %[[SCALE_1]], {{.*}} : vector<4xi8>
22+
// CHECK: llvm.insertelement %[[SCALE_1]], {{.*}} : vector<4xi8>
23+
// CHECK: llvm.insertelement %[[SCALE_1]], {{.*}} : vector<4xi8>
24+
// CHECK: %[[V1:.*]] = llvm.insertelement %[[SCALE_1]], {{.*}} : vector<4xi8>
25+
// CHECK: %[[SCALE_INT32_1:.*]] = llvm.bitcast %[[V1]] : vector<4xi8> to i32
26+
// CHECK: rocdl.cvt.scale.pk8.bf16.fp8 {{.*}}, %[[SCALE_INT32_1]][0] : vector<8xbf16>
27+
28+
// CHECK: llvm.insertelement %[[SCALE_2]], {{.*}} : vector<4xi8>
29+
// CHECK: llvm.insertelement %[[SCALE_2]], {{.*}} : vector<4xi8>
30+
// CHECK: llvm.insertelement %[[SCALE_2]], {{.*}} : vector<4xi8>
31+
// CHECK: %[[V2:.*]] = llvm.insertelement %[[SCALE_2]], {{.*}} : vector<4xi8>
32+
// CHECK: %[[SCALE_INT32_2:.*]] = llvm.bitcast %[[V2]] : vector<4xi8> to i32
33+
// CHECK: rocdl.cvt.scale.pk8.bf16.fp8 {{.*}}, %[[SCALE_INT32_2]][0] : vector<8xbf16>
34+
35+
// CHECK: llvm.insertelement %[[SCALE_3]], {{.*}} : vector<4xi8>
36+
// CHECK: llvm.insertelement %[[SCALE_3]], {{.*}} : vector<4xi8>
37+
// CHECK: llvm.insertelement %[[SCALE_3]], {{.*}} : vector<4xi8>
38+
// CHECK: %[[V3:.*]] = llvm.insertelement %[[SCALE_3]], {{.*}} : vector<4xi8>
39+
// CHECK: %[[SCALE_INT32_3:.*]] = llvm.bitcast %[[V3]] : vector<4xi8> to i32
40+
// CHECK: rocdl.cvt.scale.pk8.bf16.fp8 {{.*}}, %[[SCALE_INT32_3]][0] : vector<8xbf16>
41+
%7 = amdg.scaled_upcast_fp8 %arg0 scale %arg1 : tensor<32x128xf8E4M3FN, #blocked>, tensor<32x128xi8, #blocked> -> tensor<32x128xbf16, #blocked>
42+
tt.store %arg2, %7 : tensor<32x128x!tt.ptr<bf16>, #blocked>
43+
tt.return
44+
}
45+
}
46+
47+
// -----
48+
49+
#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
50+
#mma = #ttg.amd_wmma<{version = 3, isTranspose = true, warpsPerCTA = [4, 1], instrShape = [16, 16, 32]}>
51+
#shared = #ttg.swizzled_shared<{vec = 4, perPhase = 4, maxPhase = 4, order = [1, 0]}>
52+
#smem = #ttg.shared_memory
53+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 2048 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} {
54+
tt.func public @cvt_scale_pk8_bf16_fp4(%output: tensor<16x64x!tt.ptr<bf16>, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>, %15: tensor<16x32xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %27: tensor<16x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>) attributes {noinline = false} {
55+
// CHECK: %[[SCALE:.*]] = llvm.extractvalue %arg2[0] : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>
56+
// CHECK: %[[SCALE_1:.*]] = llvm.extractvalue %arg2[8] : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>
57+
// CHECK: %[[SCALE_2:.*]] = llvm.extractvalue %arg2[16] : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>
58+
// CHECK: %[[SCALE_3:.*]] = llvm.extractvalue %arg2[24] : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>
59+
60+
// CHECK: llvm.insertelement %[[SCALE]], {{.*}} : vector<4xi8>
61+
// CHECK: llvm.insertelement %[[SCALE]], {{.*}} : vector<4xi8>
62+
// CHECK: llvm.insertelement %[[SCALE]], {{.*}} : vector<4xi8>
63+
// CHECK: %[[V0:.*]] = llvm.insertelement %[[SCALE]], {{.*}} : vector<4xi8>
64+
// CHECK: %[[SCALE_INT32:.*]] = llvm.bitcast %[[V0]] : vector<4xi8> to i32
65+
// CHECK: rocdl.cvt.scale.pk8.bf16.fp4 {{.*}}, %[[SCALE_INT32]][0] : vector<8xbf16>
66+
67+
// CHECK: llvm.insertelement %[[SCALE_1]], {{.*}} : vector<4xi8>
68+
// CHECK: llvm.insertelement %[[SCALE_1]], {{.*}} : vector<4xi8>
69+
// CHECK: llvm.insertelement %[[SCALE_1]], {{.*}} : vector<4xi8>
70+
// CHECK: %[[V1:.*]] = llvm.insertelement %[[SCALE_1]], {{.*}} : vector<4xi8>
71+
// CHECK: %[[SCALE_INT32_1:.*]] = llvm.bitcast %[[V1]] : vector<4xi8> to i32
72+
// CHECK: rocdl.cvt.scale.pk8.bf16.fp4 {{.*}}, %[[SCALE_INT32_1]][0] : vector<8xbf16>
73+
74+
// CHECK: llvm.insertelement %[[SCALE_2]], {{.*}} : vector<4xi8>
75+
// CHECK: llvm.insertelement %[[SCALE_2]], {{.*}} : vector<4xi8>
76+
// CHECK: llvm.insertelement %[[SCALE_2]], {{.*}} : vector<4xi8>
77+
// CHECK: %[[V2:.*]] = llvm.insertelement %[[SCALE_2]], {{.*}} : vector<4xi8>
78+
// CHECK: %[[SCALE_INT32_2:.*]] = llvm.bitcast %[[V2]] : vector<4xi8> to i32
79+
// CHECK: rocdl.cvt.scale.pk8.bf16.fp4 {{.*}}, %[[SCALE_INT32_2]][0] : vector<8xbf16>
80+
81+
// CHECK: llvm.insertelement %[[SCALE_3]], {{.*}} : vector<4xi8>
82+
// CHECK: llvm.insertelement %[[SCALE_3]], {{.*}} : vector<4xi8>
83+
// CHECK: llvm.insertelement %[[SCALE_3]], {{.*}} : vector<4xi8>
84+
// CHECK: %[[V3:.*]] = llvm.insertelement %[[SCALE_3]], {{.*}} : vector<4xi8>
85+
// CHECK: %[[SCALE_INT32_3:.*]] = llvm.bitcast %[[V3]] : vector<4xi8> to i32
86+
// CHECK: rocdl.cvt.scale.pk8.bf16.fp4 {{.*}}, %[[SCALE_INT32_3]][0] : vector<8xbf16>
87+
88+
%28 = amdg.scaled_upcast_fp4 %15 scale %27 {axis = 1 : i32} : tensor<16x32xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, tensor<16x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> -> tensor<16x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
89+
tt.store %output, %28 : tensor<16x64x!tt.ptr<bf16>, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
90+
tt.return
91+
}
92+
}

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -604,7 +604,7 @@ def ScaledUpcastFp4Op : TT_AMDGPU_Op<"scaled_upcast_fp4", [Pure, DeclareOpInterf
604604

605605
let arguments = (ins
606606
RankedTensorOf<[I8]>:$input,
607-
RankedTensorOf<[BF16]>:$scale,
607+
RankedTensorOf<[BF16, I8]>:$scale,
608608
I32Attr:$axis);
609609
let results = (outs RankedTensorOf<[AnyTypeOf<[F16, BF16, F32]>]>:$output);
610610

@@ -636,14 +636,15 @@ def ScaledUpcastFp8Op : TT_AMDGPU_Op<"scaled_upcast_fp8", [
636636

637637
let arguments = (ins
638638
RankedTensorOf<[AnyTypeOf<[F8E4M3FN, F8E5M2]>]>:$input,
639-
RankedTensorOf<[BF16]>:$scale);
639+
RankedTensorOf<[BF16, I8]>:$scale);
640640
let results = (outs RankedTensorOf<[AnyTypeOf<[F16, BF16, F32]>]>:$output);
641641

642642
let assemblyFormat = [{
643643
$input `scale` $scale attr-dict
644644
`:` type($input) `,` type($scale) `->` type($output)
645645
}];
646646
}
647+
647648
//===----------------------------------------------------------------------===//
648649
// InThreadTransposeOp
649650
//===----------------------------------------------------------------------===//

third_party/amd/include/TritonAMDGPUToLLVM/PatternTritonAMDGPUToLLVM.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define TRITON_THIRD_PARTY_AMD_INCLUDE_TRITONAMDGPUTOLLVM_PATTERNTRITONAMDGPUTOLLVM_H_
33

44
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
5+
#include "third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h"
56

67
namespace mlir::triton::AMD {
78

@@ -17,7 +18,7 @@ void populateConcatOpToLLVMPatterns(mlir::LLVMTypeConverter &typeConverter,
1718

1819
void populateScaledUpcastOpToLLVMPatterns(
1920
mlir::LLVMTypeConverter &typeConverter, mlir::RewritePatternSet &patterns,
20-
mlir::PatternBenefit benefit);
21+
const AMD::TargetInfo &targetInfo, mlir::PatternBenefit benefit);
2122

2223
} // namespace mlir::triton::AMD
2324

0 commit comments

Comments
 (0)