@@ -10,24 +10,26 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.sha
1010 tt.func public @convert_dpas (%arg0: !tt.ptr <f16 > {tt.divisibility = 16 : i32 }) attributes {noinline = false } {
1111 %cst = arith.constant dense <0.000000e+00 > : tensor <128 x256 xf16 , #mma >
1212
13- // CHECK-DAG: %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32
14- // CHECK-DAG: %[[CST_16384:.*]] = llvm.mlir.constant(16384 : i32) : i32
15- // CHECK-DAG: %[[CST_8192:.*]] = llvm.mlir.constant(8192 : i32) : i32
16- // CHECK-DAG: %[[CST_128:.*]] = llvm.mlir.constant(128 : i32) : i32
17- // CHECK-DAG: %[[CST_64:.*]] = llvm.mlir.constant(64 : i32) : i32
18- // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
19- // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
20- // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
21- // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32
22- // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32
23- // CHECK-DAG: %[[SMEM:.*]] = llvm.mlir.addressof @global_smem : !llvm.ptr<3>
24- // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
25- // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
13+ // CHECK-DAG: %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32
14+ // CHECK-DAG: %[[CST_16384:.*]] = llvm.mlir.constant(16384 : i32) : i32
15+ // CHECK-DAG: %[[CST_8192:.*]] = llvm.mlir.constant(8192 : i32) : i32
16+ // CHECK-DAG: %[[CST_128:.*]] = llvm.mlir.constant(128 : i32) : i32
17+ // CHECK-DAG: %[[CST_64:.*]] = llvm.mlir.constant(64 : i32) : i32
18+ // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
19+ // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
20+ // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
21+ // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32
22+ // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32
23+ // CHECK-DAG: %[[SMEM:.*]] = llvm.mlir.addressof @global_smem : !llvm.ptr<3>
24+ // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
25+ // CHECK-DAG: %[[CST_511:.*]] = llvm.mlir.constant(511 : i32) : i32
26+ // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
2627 // COM: The following operations are generated for the conversion of DPAS layout to blocked layout. The conversion replica size is 128*256. So there is 1 round of load/store with synchronization.
2728 // CHECK: %[[threadId_64:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]]) {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return} : (i32) -> i64
2829 // CHECK: %[[threadId:.*]] = llvm.trunc %[[threadId_64]] : i64 to i32
29- // CHECK: %[[laneId:.*]] = llvm.urem %[[threadId]], %[[CST_16]] : i32
30- // CHECK: %[[warpId:.*]] = llvm.udiv %[[threadId]], %[[CST_16]] : i32
30+ // CHECK: %[[rtid:.*]] = llvm.and %[[threadId]], %[[CST_511]] : i32
31+ // CHECK: %[[laneId:.*]] = llvm.urem %[[rtid]], %[[CST_16]] : i32
32+ // CHECK: %[[warpId:.*]] = llvm.udiv %[[rtid]], %[[CST_16]] : i32
3133 // CHECK: %[[VAL_25:.*]] = llvm.and %[[laneId]], %[[CST_1]] : i32
3234 // CHECK: %[[VAL_26:.*]] = llvm.icmp "eq" %[[VAL_25]], %[[CST_0]] : i32
3335 // CHECK: %[[VAL_27:.*]] = llvm.select %[[VAL_26]], %[[CST_0]], %[[CST_1]] : i1, i32
@@ -115,12 +117,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.sha
115117 // CHECK-DAG: %[[SMEM:.*]] = llvm.mlir.addressof @global_smem : !llvm.ptr<3>
116118 // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
117119 // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
120+ // CHECK-DAG: %[[CST_511:.*]] = llvm.mlir.constant(511 : i32) : i32
118121
119122 // COM: The following operations are generated for the conversion of DPAS layout to blocked layout. The conversion replica size is 64*256. So there are 2 rounds of load/store with synchronization.
120123 // CHECK: %[[threadId_64:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]]) {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return} : (i32) -> i64
121124 // CHECK: %[[threadId:.*]] = llvm.trunc %[[threadId_64]] : i64 to i32
122- // CHECK: %[[laneId:.*]] = llvm.urem %[[threadId]], %[[CST_16]] : i32
123- // CHECK: %[[warpId:.*]] = llvm.udiv %[[threadId]], %[[CST_16]] : i32
125+ // CHECK: %[[rtid:.*]] = llvm.and %[[threadId]], %[[CST_511]] : i32
126+ // CHECK: %[[laneId:.*]] = llvm.urem %[[rtid]], %[[CST_16]] : i32
127+ // CHECK: %[[warpId:.*]] = llvm.udiv %[[rtid]], %[[CST_16]] : i32
124128 // CHECK: %[[VAL_25:.*]] = llvm.and %[[laneId]], %[[CST_1]] : i32
125129 // CHECK: %[[VAL_26:.*]] = llvm.icmp "eq" %[[VAL_25]], %[[CST_0]] : i32
126130 // CHECK: %[[VAL_27:.*]] = llvm.select %[[VAL_26]], %[[CST_0]], %[[CST_1]] : i1, i32
0 commit comments