|
57 | 57 | // CHECK: scf.yield {{.*}}, %[[INS_IDX_3]], %[[EXT_IDX_3]], %[[NEXT_A]], %[[NEXT_B]] |
58 | 58 |
|
59 | 59 | // AMD-LABEL: tt.func @matmul_loop |
| 60 | +// AMD-DAG: %[[CM1:.*]] = arith.constant -1 : index |
| 61 | +// AMD-DAG: %[[C1:.*]] = arith.constant 1 : index |
60 | 62 | // AMD-DAG: %[[C0:.*]] = arith.constant 0 : index |
61 | | -// AMD: %{{.*}}:6 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) |
| 63 | +// AMD: %[[UB1:.*]] = arith.subi %[[UB:.*]], %arg2 : index |
| 64 | +// AMD: %[[FOR:.*]]:6 = scf.for %[[ARG5:.*]] = %[[LB:.*]] to %[[UB1]] step %[[STEP:.*]] iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) |
62 | 65 | // AMD: %[[LOCAL_LOAD_32:.*]] = triton_gpu.local_load %[[ARG10]] |
63 | 66 | // AMD: %[[LOCAL_LOAD_33:.*]] = triton_gpu.local_load %[[ARG11]] |
64 | 67 | // AMD: %[[MULF_34:.*]] = arith.mulf %[[LOCAL_LOAD_33]], %{{.*}} |
|
76 | 79 | // AMD: triton_gpu.local_store %[[LOAD_39]], %[[MEMDESC_SUBVIEW_44]] |
77 | 80 | // AMD: scf.yield %[[ADDPTR_36]], %[[ADDPTR_37]], %[[DOT_35]], %[[SELECT_42]], %[[MEMDESC_SUBVIEW_43]], %[[MEMDESC_SUBVIEW_44]] |
78 | 81 | // AMD: } |
79 | | -// AMD: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} |
80 | | -// AMD: %[[ADDI_22:.*]] = arith.addi %[[SUBI_21]], %{{.*}} |
81 | | -// AMD: %[[ADDI_23:.*]] = arith.addi %[[ADDI_22]], %{{.*}}-1 |
82 | | -// AMD: %[[DIVUI_24:.*]] = arith.divui %[[ADDI_23]], %{{.*}} |
83 | | -// AMD: %[[ADDI_25:.*]] = arith.addi %[[DIVUI_24]], %{{.*}}-1 |
84 | | -// AMD: %[[CMPI_26:.*]] = arith.cmpi sge, %[[ADDI_25]], %[[C0]] |
85 | | -// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %{{.*}}#4 |
86 | | -// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %{{.*}}#5 |
| 82 | +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[STEP]], %[[C0]] |
| 83 | +// AMD: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[C1]], %[[CM1]] |
| 84 | +// AMD: %[[SUBI_23:.*]] = arith.subi %[[UB]], %[[LB]] |
| 85 | +// AMD: %[[ADDI_24:.*]] = arith.addi %[[SUBI_23]], %[[STEP]] |
| 86 | +// AMD: %[[ADDI_25:.*]] = arith.addi %[[ADDI_24]], %[[SELECT_22]] |
| 87 | +// AMD: %[[DIVUI_26:.*]] = arith.divui %[[ADDI_25]], %[[STEP]] |
| 88 | +// AMD: %[[ADDI_27:.*]] = arith.addi %[[DIVUI_26]], %[[CM1]] |
| 89 | +// AMD: %[[CMPI_28:.*]] = arith.cmpi sge, %[[ADDI_27]], %[[C0]] |
| 90 | +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[FOR]]#4 |
| 91 | +// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[FOR]]#5 |
87 | 92 | // AMD: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_28]], %{{.*}} |
88 | | -// AMD: %[[IF_30:.*]] = scf.if %[[CMPI_26]] |
89 | | -// AMD: %[[DOT_32:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[MULF_29]], %{{.*}}#2 |
| 93 | +// AMD: %[[IF_30:.*]] = scf.if %[[CMPI_28]] |
| 94 | +// AMD: %[[DOT_32:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[MULF_29]], %[[FOR]]#2 |
90 | 95 | // AMD: scf.yield %[[DOT_32]] |
91 | 96 | // AMD: } else { |
92 | | -// AMD: scf.yield %{{.*}}#2 |
| 97 | +// AMD: scf.yield %[[FOR]]#2 |
93 | 98 | // AMD: } |
94 | | -// AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_26]], %[[IF_30]], %{{.*}}#2 |
| 99 | +// AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_28]], %[[IF_30]], %[[FOR]]#2 |
95 | 100 | // AMD: triton_gpu.local_dealloc %{{.*}} |
96 | 101 | // AMD: triton_gpu.local_dealloc %{{.*}} |
97 | 102 |
|
|
0 commit comments