@@ -73,10 +73,10 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
73
73
// CHECK: %[[SUB_GROUP_ID_RAW:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32
74
74
// CHECK: %[[SUB_GROUP_ID_EXT:.*]] = llvm.zext %[[SUB_GROUP_ID_RAW]] : i32 to i64
75
75
// CHECK: %[[SUB_GROUP_ID:.*]] = llvm.trunc %[[SUB_GROUP_ID_EXT]] : i64 to i32
76
- // CHECK: %[[VAL_18:.*]] = llvm.mlir.constant(2 : i32) : i32
76
+ // CHECK: %[[VAL_18:.*]] = llvm.mlir.constant(1 : i32) : i32
77
77
// CHECK: %[[VAL_19:.*]] = llvm.urem %[[SUB_GROUP_ID]], %[[VAL_18]] : i32
78
78
// CHECK: %[[VAL_20:.*]] = llvm.udiv %[[SUB_GROUP_ID]], %[[VAL_18]] : i32
79
- // CHECK: %[[CST_8:.*]] = llvm.mlir.constant(4 : i32) : i32
79
+ // CHECK: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
80
80
// CHECK: %[[VAL_22:.*]] = llvm.urem %[[VAL_20]], %[[CST_8]] : i32
81
81
// CHECK: %[[VAL_23:.*]] = llvm.udiv %[[VAL_20]], %[[CST_8]] : i32
82
82
// CHECK: %[[OFFSET_0:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
@@ -94,20 +94,20 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
94
94
// CHECK: %[[VAL_24:.*]] = llvm.mul %[[COL_STRIDE_i64]], %[[CST_2]] : i64
95
95
// CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32
96
96
// CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
97
- // CHECK: %[[CST_32:.*]] = llvm.mlir.constant(16 : i32) : i32
97
+ // CHECK: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
98
98
// CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_19]], %[[CST_32]] : i32
99
99
// CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32
100
100
// CHECK: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
101
101
// CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32
102
102
// CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.add %[[VAL_28]], %[[OFFSET_1]] : i32
103
103
// CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
104
- // CHECK: %[[CST_2:.*]] = llvm.mlir.constant(4 : i32) : i32
104
+ // CHECK: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32
105
105
// CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_22]], %[[CST_2]] : i32
106
106
// CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32
107
107
// CHECK: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
108
108
// CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32
109
109
// CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.add %[[VAL_32]], %[[OFFSET_0]] : i32
110
- // CHECK: triton_gen.2Dblockprefetch %[[BASE_]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSET_X]], %[[COL_MAJOR_OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 4 , v_blocks = 1 , cache_control = L1C_L3C}
110
+ // CHECK: triton_gen.2Dblockprefetch %[[BASE_]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSET_X]], %[[COL_MAJOR_OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 2, v_blocks = 2, cache_control = L1C_L3C}
111
111
%columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x16xf16>>
112
112
ttig.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, ttig.block_io = "column_major"} : !tt.ptr<tensor<32x16xf16>>
113
113
0 commit comments