Skip to content

Commit 267c32c

Browse files
committed
back to scheduledag mutation
1 parent e6827be commit 267c32c

File tree

4 files changed

+96
-5
lines changed

4 files changed

+96
-5
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3363,9 +3363,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
33633363
// Already computed the OpcodeOffset table, just index into it.
33643364
if (N.getOpcode() < OpcodeOffset.size())
33653365
MatcherIndex = OpcodeOffset[N.getOpcode()];
3366-
if (N->getOpcode() == ISD::FADD) {
3367-
MatcherIndex = 0;
3368-
}
3366+
// if (N->getOpcode() == ISD::FADD) {
3367+
// MatcherIndex = 0;
3368+
// }
33693369
LLVM_DEBUG(dbgs() << " Initial Opcode index to " << MatcherIndex << "\n");
33703370
} else if (MatcherTable[0] == OPC_SwitchOpcode) {
33713371
// Otherwise, the table isn't computed, but the state machine does start

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1143,8 +1143,9 @@ struct MaxsUnpackPackedF32OpsDAGMutation : ScheduleDAGMutation {
11431143
const TargetInstrInfo &TII = *DAG->TII;
11441144
const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
11451145
for (auto &I : *DAG) {
1146-
if (I.getOpcode() == AMDGPU::V_PK_ADD_F32)
1146+
if (I.getOpcode() == AMDGPU::V_PK_ADD_F32) {
11471147
I.dump();
1148+
}
11481149
I.dump();
11491150
}
11501151
llvm::dbgs() << "Completed MaxsUnpackPackedF32OpsDAGMutation\n";

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1153,7 +1153,7 @@ let isCommutable = 1, isReMaterializable = 1 in {
11531153
let SubtargetPredicate = HasPackedFP32Ops in {
11541154
defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
11551155
defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
1156-
defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, my_any_fadd>;
1156+
defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
11571157
} // End SubtargetPredicate = HasPackedFP32Ops
11581158

11591159
let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in

llvm/lib/Target/AMDGPU/test_v_pk.ll

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
; /home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll -mtriple=amdgcn -mcpu=gfx942 -o -
2+
; /home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll -mattr=-packed-fp32-ops -mtriple=amdgcn -mcpu=gfx942 -o -
3+
4+
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
5+
; add_kernel(%0, %1, %2, %3): kernel body of test_v_pk.ll built around two
; <2 x float> fadd instructions.
;   %0, %1 - read-only global (addrspace 1) float inputs, called a and b below
;   %2     - write-only global (addrspace 1) float output
;   %3     - i32 bound compared (signed) against the base index
; Each work-item computes a base element index
;   %10 = ((workitem.id.x << 2) & 1020) | (workgroup.id.x << 10)
; and, only when %10 < %3, loads four consecutive floats from a and from b
; starting at element %10, packs them into <2 x float> values, adds them, and
; stores four scalar lanes to %2 at indices %10, %10|1, %10|2 and %10|3.
; In the comments below a[k] / b[k] mean element %10 + k of %0 / %1.
; NOTE(review): the b lanes are packed in reversed order (see %v_302, %v_402),
; so the sums are not element-wise a[k] + b[k] - presumably deliberate for the
; packed-add experiment; confirm.
; NOTE(review): attribute group #0 is referenced on the define line, but no
; `attributes #0 = ...` definition is visible in this file.
define amdgpu_kernel void @add_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture readonly %1, ptr addrspace(1) nocapture writeonly %2, i32 %3) local_unnamed_addr #0 {
6+
%5 = tail call i32 @llvm.amdgcn.workgroup.id.x()
7+
%6 = shl i32 %5, 10 ; workgroup id * 1024
8+
%7 = tail call i32 @llvm.amdgcn.workitem.id.x()
9+
%8 = shl i32 %7, 2 ; workitem id * 4
10+
%9 = and i32 %8, 1020 ; keep bits 2..9 only: a multiple of 4 below 1024
11+
%10 = or disjoint i32 %9, %6 ; base element index for this work-item
12+
%11 = icmp slt i32 %10, %3 ; signed bounds check on the base index only
13+
br i1 %11, label %.critedge, label %.critedge2
14+
15+
.critedge: ; preds = %4
16+
%12 = or disjoint i32 %10, 3 ; base index + 3 (low two bits of %10 are zero)
17+
%13 = or disjoint i32 %10, 2 ; base index + 2
18+
%14 = or disjoint i32 %10, 1 ; base index + 1
19+
%15 = sext i32 %10 to i64
20+
%16 = getelementptr float, ptr addrspace(1) %0, i64 %15
21+
%17 = addrspacecast ptr addrspace(1) %16 to ptr ; a is accessed through a default-address-space pointer
22+
%18 = load float, ptr %17, align 16 ; a[0]
23+
%19 = getelementptr inbounds i8, ptr %17, i64 4
24+
%20 = load float, ptr %19, align 4 ; a[1]
25+
26+
%v_100 = insertelement <2 x float> undef, float %18, i32 0
27+
%v_102 = insertelement <2 x float> %v_100, float %20, i32 1 ; <a[0], a[1]>
28+
29+
%21 = getelementptr inbounds i8, ptr %17, i64 8
30+
%22 = load float, ptr %21, align 8 ; a[2]
31+
%23 = getelementptr inbounds i8, ptr %17, i64 12
32+
%24 = load float, ptr %23, align 4 ; a[3]
33+
34+
%v_200 = insertelement <2 x float> undef, float %22, i32 0
35+
%v_202 = insertelement <2 x float> %v_200, float %24, i32 1 ; <a[2], a[3]>
36+
37+
%25 = getelementptr float, ptr addrspace(1) %1, i64 %15
38+
%26 = addrspacecast ptr addrspace(1) %25 to ptr ; b is accessed through a default-address-space pointer
39+
%27 = sext i32 %12 to i64
40+
%28 = getelementptr float, ptr addrspace(1) %2, i64 %27 ; NOTE(review): %28 is never used; %p_28 recomputes the same address
41+
%29 = sext i32 %13 to i64
42+
%30 = getelementptr float, ptr addrspace(1) %2, i64 %29 ; NOTE(review): %30 is never used; %p_30 recomputes the same address
43+
%31 = sext i32 %14 to i64
44+
%32 = getelementptr float, ptr addrspace(1) %2, i64 %31 ; NOTE(review): %32 is never used; %p_32 recomputes the same address
45+
%33 = getelementptr inbounds i8, ptr %26, i64 12
46+
%34 = load float, ptr %33, align 4 ; b[3]
47+
48+
%36 = getelementptr inbounds i8, ptr %26, i64 8
49+
%37 = load float, ptr %36, align 8 ; b[2]
50+
51+
%v_300 = insertelement <2 x float> undef, float %34, i32 0
52+
%v_302 = insertelement <2 x float> %v_300, float %37, i32 1 ; <b[3], b[2]> - reversed lane order
53+
54+
%39 = getelementptr inbounds i8, ptr %26, i64 4
55+
%40 = load float, ptr %39, align 4 ; b[1]
56+
%42 = load float, ptr %26, align 16 ; b[0]
57+
58+
%v_400 = insertelement <2 x float> undef, float %40, i32 0
59+
%v_402 = insertelement <2 x float> %v_400, float %42, i32 1 ; <b[1], b[0]> - reversed lane order
60+
61+
%v_500 = fadd <2 x float> %v_102, %v_402 ; <a[0]+b[1], a[1]+b[0]>
62+
%v_501 = fadd <2 x float> %v_202, %v_302 ; <a[2]+b[3], a[3]+b[2]>
63+
; tail call void @llvm.amdgcn.iglp.opt(i32 4)
64+
65+
%v_45 = extractelement <2 x float> %v_501, i32 1 ; a[3]+b[2]; NOTE(review): %v_45 is never used below
66+
%v_32 = extractelement <2 x float> %v_501, i32 0 ; a[2]+b[3]
67+
%v_30 = extractelement <2 x float> %v_500, i32 1 ; a[1]+b[0]
68+
%v_28 = extractelement <2 x float> %v_500, i32 0 ; a[0]+b[1]
69+
70+
%i_44 = sext i32 %10 to i64
71+
%p_45 = getelementptr float, ptr addrspace(1) %2, i64 %i_44
72+
store float %v_28, ptr addrspace(1) %p_45, align 4 ; out[base] = a[0]+b[1]; NOTE(review): by naming, %p_45 may have been meant to receive %v_45 - confirm
73+
74+
%i_31 = sext i32 %14 to i64
75+
%p_32 = getelementptr float, ptr addrspace(1) %2, i64 %i_31
76+
store float %v_32, ptr addrspace(1) %p_32, align 4 ; out[base+1] = a[2]+b[3]
77+
78+
%i_29 = sext i32 %13 to i64
79+
%p_30 = getelementptr float, ptr addrspace(1) %2, i64 %i_29
80+
store float %v_30, ptr addrspace(1) %p_30, align 4 ; out[base+2] = a[1]+b[0]
81+
82+
%i_27 = sext i32 %12 to i64
83+
%p_28 = getelementptr float, ptr addrspace(1) %2, i64 %i_27
84+
store float %v_28, ptr addrspace(1) %p_28, align 4 ; out[base+3] = a[0]+b[1]; second store of %v_28
85+
86+
br label %.critedge2
87+
88+
.critedge2: ; preds = %4, %.critedge
89+
ret void
90+
}

0 commit comments

Comments
 (0)