8484// AMD: %[[SUBI_23:.*]] = arith.subi %[[UB]], %[[LB]]
8585// AMD: %[[ADDI_24:.*]] = arith.addi %[[SUBI_23]], %[[STEP]]
8686// AMD: %[[ADDI_25:.*]] = arith.addi %[[ADDI_24]], %[[SELECT_22]]
87- // AMD: %[[DIVUI_26:.*]] = arith.divsi %[[ADDI_25]], %[[STEP]]
88- // AMD: %[[ADDI_27:.*]] = arith.addi %[[DIVUI_26]], %[[CM1]]
89- // AMD: %[[CMPI_28:.*]] = arith.cmpi sge, %[[ADDI_27]], %[[C0]]
90- // AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[FOR]]#4
91- // AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[FOR]]#5
92- // AMD: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_28]], %{{.*}}
93- // AMD: %[[IF_30:.*]] = scf.if %[[CMPI_28]]
94- // AMD: %[[DOT_32:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[MULF_29]], %[[FOR]]#2
95- // AMD: scf.yield %[[DOT_32]]
87+ // AMD: %[[DIVSI_26:.*]] = arith.divsi %[[ADDI_25]], %[[STEP]]
88+ // AMD: %[[CMPI_27:.*]] = arith.cmpi sge, %[[DIVSI_26]], %{{.*}}
89+ // AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %{{.*}}#4
90+ // AMD: %[[LOCAL_LOAD_29:.*]] = triton_gpu.local_load %{{.*}}#5
91+ // AMD: %[[MULF_30:.*]] = arith.mulf %[[LOCAL_LOAD_29]], %{{.*}}
92+ // AMD: %[[IF_31:.*]] = scf.if %[[CMPI_27]]
93+ // AMD: %[[DOT_33:.*]] = tt.dot %[[LOCAL_LOAD_28]], %[[MULF_30]], %{{.*}}#2
94+ // AMD: scf.yield %[[DOT_33]]
9695// AMD: } else {
97- // AMD: scf.yield %[[FOR]]#2
96+ // AMD: scf.yield %{{.*}}#2
9897// AMD: }
99- // AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_28]], %[[IF_30]], %[[FOR]]#2
98+ // AMD: %[[SELECT_32:.*]] = arith.select %[[CMPI_27]], %[[IF_31]], %{{.*}}#2
10099// AMD: triton_gpu.local_dealloc %{{.*}}
101100// AMD: triton_gpu.local_dealloc %{{.*}}
102101
@@ -414,35 +413,33 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
414413// AMD: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_58]]
415414// AMD: scf.yield %[[DOT_45]], %[[ADDPTR_46]], %[[ADDPTR_47]], %[[SELECT_56]], %[[MEMDESC_SUBVIEW_57]], %[[MEMDESC_SUBVIEW_58]], %[[LOAD_48]], %[[LOAD_53]]
416415// AMD: }
417- // AMD: %[[ADDI_26:.*]] = arith.addi %{{.*}}, %{{.*}}-1
418- // AMD: %[[CMPI_27:.*]] = arith.cmpi sge, %[[ADDI_26]], %{{.*}}
419- // AMD: %[[ADDI_28:.*]] = arith.addi %{{.*}}, %{{.*}}-2
420- // AMD: %[[CMPI_29:.*]] = arith.cmpi sge, %[[ADDI_28]], %{{.*}}
421- // AMD: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %{{.*}}#4
422- // AMD: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %{{.*}}#5
423- // AMD: %[[IF_32:.*]] = scf.if %[[CMPI_27]]
424- // AMD: %[[DOT_43:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[LOCAL_LOAD_31]], %{{.*}}#0
425- // AMD: scf.yield %[[DOT_43]]
416+ // AMD: %[[CMPI_26:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
417+ // AMD: %[[CMPI_27:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
418+ // AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %{{.*}}#4
419+ // AMD: %[[LOCAL_LOAD_29:.*]] = triton_gpu.local_load %{{.*}}#5
420+ // AMD: %[[IF_30:.*]] = scf.if %[[CMPI_26]]
421+ // AMD: %[[DOT_41:.*]] = tt.dot %[[LOCAL_LOAD_28]], %[[LOCAL_LOAD_29]], %{{.*}}#0
422+ // AMD: scf.yield %[[DOT_41]]
426423// AMD: } else {
427- // AMD: scf.yield %{{.*}}#0
424+ // AMD: scf.yield %{{.*}}#0
428425// AMD: }
429- // AMD: %[[ADDI_33:.*]] = arith.addi %{{.*}}#3, %{{.*}}
430- // AMD: %[[CMPI_34:.*]] = arith.cmpi slt, %[[ADDI_33]], %{{.*}}
431- // AMD: %[[SELECT_35:.*]] = arith.select %[[CMPI_34]], %[[ADDI_33]], %{{.*}}
432- // AMD: %[[MEMDESC_SUBVIEW_36:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_35]], %{{.*}}, %{{.*}}]
433- // AMD: triton_gpu.local_store %{{.*}}#6, %[[MEMDESC_SUBVIEW_36]]
434- // AMD: %[[MEMDESC_SUBVIEW_37:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_35]], %{{.*}}, %{{.*}}]
435- // AMD: triton_gpu.local_store %{{.*}}#7, %[[MEMDESC_SUBVIEW_37]]
436- // AMD: %[[SELECT_38:.*]] = arith.select %[[CMPI_27]], %[[IF_32]], %{{.*}}#0
437- // AMD: %[[LOCAL_LOAD_39:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_36]]
438- // AMD: %[[LOCAL_LOAD_40:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_37]]
439- // AMD: %[[IF_41:.*]] = scf.if %[[CMPI_29]]
440- // AMD: %[[DOT_43:.*]] = tt.dot %[[LOCAL_LOAD_39]], %[[LOCAL_LOAD_40]], %[[SELECT_38]]
441- // AMD: scf.yield %[[DOT_43]]
426+ // AMD: %[[ADDI_31:.*]] = arith.addi %{{.*}}#3, %{{.*}}
427+ // AMD: %[[CMPI_32:.*]] = arith.cmpi slt, %[[ADDI_31]], %{{.*}}
428+ // AMD: %[[SELECT_33:.*]] = arith.select %[[CMPI_32]], %[[ADDI_31]], %{{.*}}
429+ // AMD: %[[MEMDESC_SUBVIEW_34:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_33]], %{{.*}}, %{{.*}}]
430+ // AMD: triton_gpu.local_store %{{.*}}#6, %[[MEMDESC_SUBVIEW_34]]
431+ // AMD: %[[MEMDESC_SUBVIEW_35:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_33]], %{{.*}}, %{{.*}}]
432+ // AMD: triton_gpu.local_store %{{.*}}#7, %[[MEMDESC_SUBVIEW_35]]
433+ // AMD: %[[SELECT_36:.*]] = arith.select %[[CMPI_26]], %[[IF_30]], %{{.*}}#0
434+ // AMD: %[[LOCAL_LOAD_37:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_34]]
435+ // AMD: %[[LOCAL_LOAD_38:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_35]]
436+ // AMD: %[[IF_39:.*]] = scf.if %[[CMPI_27]]
437+ // AMD: %[[DOT_41:.*]] = tt.dot %[[LOCAL_LOAD_37]], %[[LOCAL_LOAD_38]], %[[SELECT_36]]
438+ // AMD: scf.yield %[[DOT_41]]
442439// AMD: } else {
443- // AMD: scf.yield %[[SELECT_38]]
440+ // AMD: scf.yield %[[SELECT_36]]
444441// AMD: }
445- // AMD: %[[SELECT_42:.*]] = arith.select %[[CMPI_29]], %[[IF_41]], %[[SELECT_38]]
442+ // AMD: %[[SELECT_40:.*]] = arith.select %[[CMPI_27]], %[[IF_39]], %[[SELECT_36]]
446443// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]]
447444// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]]
448445
@@ -976,6 +973,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
976973
977974// AMD-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}>
978975// AMD-LABEL: tt.func @indirect_load_shared_layout
976+ // AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc
977+ // AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc
979978// AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}})
980979// AMD: %[[LOCAL_LOAD_47:.*]] = triton_gpu.local_load %[[ARG11]]
981980// AMD: %[[LOCAL_LOAD_48:.*]] = triton_gpu.local_load %[[ARG12]]
@@ -998,44 +997,42 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
998997// AMD: triton_gpu.local_store %[[LOAD_57]], %[[MEMDESC_SUBVIEW_63]]
999998// AMD: scf.yield %[[DOT_49]], %[[ADDPTR_50]], %[[ADDPTR_51]], %[[SELECT_61]], %[[MEMDESC_SUBVIEW_62]], %[[MEMDESC_SUBVIEW_63]], %[[LOAD_58]]
1000999// AMD: }
1001- // AMD: %[[ADDI_21:.*]] = arith.addi %{{.*}}, %{{.*}}-1
1002- // AMD: %[[CMPI_22:.*]] = arith.cmpi sge, %[[ADDI_21]], %{{.*}}
1003- // AMD: %[[ADDI_23:.*]] = arith.addi %{{.*}}, %{{.*}}-2
1004- // AMD: %[[CMPI_24:.*]] = arith.cmpi sge, %[[ADDI_23]], %{{.*}}
1005- // AMD: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %{{.*}}#4
1006- // AMD: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %{{.*}}#5
1007- // AMD: %[[IF_27:.*]] = scf.if %[[CMPI_22]]
1008- // AMD: %[[DOT_47:.*]] = tt.dot %[[LOCAL_LOAD_25]], %[[LOCAL_LOAD_26]], %{{.*}}#0
1009- // AMD: scf.yield %[[DOT_47]]
1000+ // AMD: %[[CMPI_21:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
1001+ // AMD: %[[CMPI_22:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
1002+ // AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %{{.*}}#4
1003+ // AMD: %[[LOCAL_LOAD_24:.*]] = triton_gpu.local_load %{{.*}}#5
1004+ // AMD: %[[IF_25:.*]] = scf.if %[[CMPI_21]]
1005+ // AMD: %[[DOT_45:.*]] = tt.dot %[[LOCAL_LOAD_23]], %[[LOCAL_LOAD_24]], %{{.*}}#0
1006+ // AMD: scf.yield %[[DOT_45]]
10101007// AMD: } else {
10111008// AMD: scf.yield %{{.*}}#0
10121009// AMD: }
1013- // AMD: %[[ADDPTR_28:.*]] = tt.addptr %{{.*}}#1, %{{.*}}
1014- // AMD: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_24]]
1015- // AMD: %[[LOAD_30:.*]] = tt.load %[[ADDPTR_28]], %[[SPLAT_29]]
1016- // AMD: %[[EXPAND_DIMS_31:.*]] = tt.expand_dims %{{.*}}#6 {axis = 1 : i32}
1017- // AMD: %[[BROADCAST_32:.*]] = tt.broadcast %[[EXPAND_DIMS_31]]
1018- // AMD: %[[MULI_33:.*]] = arith.muli %{{.*}}, %[[BROADCAST_32]]
1019- // AMD: %[[ADDPTR_34:.*]] = tt.addptr %{{.*}}, %[[MULI_33]]
1020- // AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_24]]
1021- // AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_34]], %[[SPLAT_35]]
1022- // AMD: %[[ADDI_37:.*]] = arith.addi %{{.*}}#3, %{{.*}}
1023- // AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}}
1024- // AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}}
1025- // AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}]
1026- // AMD: triton_gpu.local_store %[[LOAD_30]], %[[MEMDESC_SUBVIEW_40]]
1027- // AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}]
1028- // AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]]
1029- // AMD: %[[SELECT_42:.*]] = arith.select %[[CMPI_22]], %[[IF_27]], %{{.*}}#0
1030- // AMD: %[[LOCAL_LOAD_43:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_40]]
1031- // AMD: %[[LOCAL_LOAD_44:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_41]]
1032- // AMD: %[[IF_45:.*]] = scf.if %[[CMPI_24]]
1033- // AMD: %[[DOT_47:.*]] = tt.dot %[[LOCAL_LOAD_43]], %[[LOCAL_LOAD_44]], %[[SELECT_42]]
1034- // AMD: scf.yield %[[DOT_47]]
1010+ // AMD: %[[ADDPTR_26:.*]] = tt.addptr %{{.*}}#1, %{{.*}}
1011+ // AMD: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_22]]
1012+ // AMD: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]]
1013+ // AMD: %[[EXPAND_DIMS_29:.*]] = tt.expand_dims %{{.*}}#6 {axis = 1 : i32}
1014+ // AMD: %[[BROADCAST_30:.*]] = tt.broadcast %[[EXPAND_DIMS_29]]
1015+ // AMD: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[BROADCAST_30]]
1016+ // AMD: %[[ADDPTR_32:.*]] = tt.addptr %{{.*}}, %[[MULI_31]]
1017+ // AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]]
1018+ // AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_33]]
1019+ // AMD: %[[ADDI_35:.*]] = arith.addi %{{.*}}#3, %{{.*}}
1020+ // AMD: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}}
1021+ // AMD: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}}
1022+ // AMD: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_37]], %{{.*}}, %{{.*}}]
1023+ // AMD: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_38]]
1024+ // AMD: %[[MEMDESC_SUBVIEW_39:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_37]], %{{.*}}, %{{.*}}]
1025+ // AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_39]]
1026+ // AMD: %[[SELECT_40:.*]] = arith.select %[[CMPI_21]], %[[IF_25]], %{{.*}}#0
1027+ // AMD: %[[LOCAL_LOAD_41:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_38]]
1028+ // AMD: %[[LOCAL_LOAD_42:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_39]]
1029+ // AMD: %[[IF_43:.*]] = scf.if %[[CMPI_22]]
1030+ // AMD: %[[DOT_45:.*]] = tt.dot %[[LOCAL_LOAD_41]], %[[LOCAL_LOAD_42]], %[[SELECT_40]]
1031+ // AMD: scf.yield %[[DOT_45]]
10351032// AMD: } else {
1036- // AMD: scf.yield %[[SELECT_42]]
1033+ // AMD: scf.yield %[[SELECT_40]]
10371034// AMD: }
1038- // AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_24]], %[[IF_45]], %[[SELECT_42]]
1035+ // AMD: %[[SELECT_44:.*]] = arith.select %[[CMPI_22]], %[[IF_43]], %[[SELECT_40]]
10391036// AMD: triton_gpu.local_dealloc %{{.*}}
10401037// AMD: triton_gpu.local_dealloc %{{.*}}
10411038
0 commit comments