Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3363,9 +3363,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// Already computed the OpcodeOffset table, just index into it.
if (N.getOpcode() < OpcodeOffset.size())
MatcherIndex = OpcodeOffset[N.getOpcode()];
if (N->getOpcode() == ISD::FADD) {
MatcherIndex = 0;
}
if (N->getOpcode() == ISD::FADD) {
MatcherIndex = 0;
}
LLVM_DEBUG(dbgs() << " Initial Opcode index to " << MatcherIndex << "\n");
} else if (MatcherTable[0] == OPC_SwitchOpcode) {
// Otherwise, the table isn't computed, but the state machine does start
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -793,7 +793,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
}

if (N->getOpcode() == ISD::FADD && false) {
if (N->getOpcode() == ISD::FADD) {
llvm::dbgs() << "N->dump()\n";
N->dump();
MySelectCode(N);
Expand Down Expand Up @@ -1717,7 +1717,7 @@ bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
return true;
}

bool AMDGPUDAGToDAGISel::MaxsComplexPatternPackedFP(SDNode *N) const {
bool AMDGPUDAGToDAGISel::MaxsComplexPatternPackedFP(SDNode *N, SelectionDAG *CurDAG) const {
N->dump();
return false;
}
Expand Down
63 changes: 32 additions & 31 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectWAVE_ADDRESS(SDNode *N);
void SelectSTACKRESTORE(SDNode *N);

bool MaxsComplexPatternPackedFP(SDNode *N) const;
bool MaxsComplexPatternPackedFP(SDNode *N, SelectionDAG *CurDAG) const;

protected:
// Include the pieces autogenerated from the target description.
Expand All @@ -290,7 +290,7 @@ void MySelectCode(SDNode *N) {
#define TARGET_VAL(X) X & 255, unsigned(X) >> 8
#define COVERAGE_IDX_VAL(X) X & 255, (unsigned(X) >> 8) & 255, (unsigned(X) >> 16) & 255, (unsigned(X) >> 24) & 255
static const unsigned char MatcherTable[] = {
// /*528829*/ /*SwitchOpcode*/ 118|128,4/*630*/, TARGET_VAL(ISD::FADD),// ->529463
// /*528829*/ /*SwitchOpcode*/ 120|128,4/*632*/, TARGET_VAL(ISD::FADD),// ->529465
/*528833*/ OPC_Scope, 76, /*->528911*/ // 4 children in Scope
/*528835*/ OPC_MoveChild0,
/*528836*/ OPC_CheckOpcode, TARGET_VAL(ISD::INTRINSIC_WO_CHAIN),
Expand Down Expand Up @@ -393,7 +393,7 @@ void MySelectCode(SDNode *N) {
// Src: (fadd:{ *:[f32] } (AMDGPUfmul_legacy_impl:{ *:[f32] } (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src0, i32:{ *:[i32] }:$src0_mod), (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src1, i32:{ *:[i32] }:$src1_mod)), (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src2, i32:{ *:[i32] }:$src2_mod)) - Complexity = 33
// Dst: (V_MAD_LEGACY_F32_e64:{ *:[f32] } ?:{ *:[i32] }:$src0_mod, ?:{ *:[f32] }:$src0, ?:{ *:[i32] }:$src1_mod, ?:{ *:[f32] }:$src1, ?:{ *:[i32] }:$src2_mod, ?:{ *:[f32] }:$src2, 0:{ *:[i1] }, 0:{ *:[i32] })
/*529063*/ 0, /*End of Scope*/
/*529064*/ /*Scope*/ 12|128,3/*396*/, /*->529462*/
/*529064*/ /*Scope*/ 14|128,3/*398*/, /*->529464*/
/*529066*/ OPC_RecordChild0, // #0 = $VOP3NoMods:src2
/*529067*/ OPC_Scope, 74, /*->529143*/ // 2 children in Scope
/*529069*/ OPC_MoveChild1,
Expand Down Expand Up @@ -428,7 +428,7 @@ void MySelectCode(SDNode *N) {
// Src: (fadd:{ *:[f32] } (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src2, i32:{ *:[i32] }:$src2_mod), (AMDGPUfmul_legacy_impl:{ *:[f32] } (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src0, i32:{ *:[i32] }:$src0_mod), (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src1, i32:{ *:[i32] }:$src1_mod))) - Complexity = 33
// Dst: (V_MAD_LEGACY_F32_e64:{ *:[f32] } ?:{ *:[i32] }:$src0_mod, ?:{ *:[f32] }:$src0, ?:{ *:[i32] }:$src1_mod, ?:{ *:[f32] }:$src1, ?:{ *:[i32] }:$src2_mod, ?:{ *:[f32] }:$src2, 0:{ *:[i1] }, 0:{ *:[i32] })
/*529142*/ 0, /*End of Scope*/
/*529143*/ /*Scope*/ 60|128,2/*316*/, /*->529461*/
/*529143*/ /*Scope*/ 62|128,2/*318*/, /*->529463*/
/*529145*/ OPC_RecordChild1, // #1 = $src1
/*529146*/ OPC_Scope, 25, /*->529173*/ // 6 children in Scope
/*529148*/ OPC_CheckPredicate3, // Predicate_anonymous_13768
Expand Down Expand Up @@ -543,36 +543,37 @@ void MySelectCode(SDNode *N) {
// Src: (fadd:{ *:[f64] } (VOP3Mods:{ *:[f64] } f64:{ *:[f64] }:$src1, i32:{ *:[i32] }:$src1_modifiers), (VOP3Mods0:{ *:[f64] } f64:{ *:[f64] }:$src0, i32:{ *:[i32] }:$src0_modifiers, i1:{ *:[i1] }:$clamp, i32:{ *:[i32] }:$omod)) - Complexity = -973
// Dst: (V_ADD_F64_e64:{ *:[f64] } i32:{ *:[i32] }:$src0_modifiers, f64:{ *:[f64] }:$src0, i32:{ *:[i32] }:$src1_modifiers, f64:{ *:[f64] }:$src1, i1:{ *:[i1] }:$clamp, i32:{ *:[i32] }:$omod)
/*529389*/ 0, /*End of Scope*/
/*529390*/ /*Scope*/ 33, /*->529424*/
/*529391*/ OPC_CheckType, /*MVT::v2f16*/89,
/*529393*/ OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3
/*529396*/ OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5
/*529399*/ OPC_EmitInteger, /*MVT::i1*/2, 0, // 0 #6
/*529402*/ OPC_EmitInteger32, 0, // 0 #7
/*529404*/ OPC_EmitInteger32, 0, // 0 #8
/*529406*/ OPC_EmitInteger32, 0, // 0 #9
/*529408*/ OPC_EmitInteger32, 0, // 0 #10
/*529410*/ OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F16),
/*529390*/ /*Scope*/ 37, /*->529428*/
/*529391*/ OPC_CheckType, /*MVT::v2f32*/109,
/*529393*/ OPC_CheckPredicate, 108, // Predicate_my_any_fadd
/*529395*/ OPC_CheckPatternPredicate, 104, // (Subtarget->hasPackedFP32Ops())
/*529397*/ OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3
/*529400*/ OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5
/*529403*/ OPC_EmitInteger, /*MVT::i1*/2, 0, // 0 #6
/*529406*/ OPC_EmitInteger32, 0, // 0 #7
/*529408*/ OPC_EmitInteger32, 0, // 0 #8
/*529410*/ OPC_EmitInteger32, 0, // 0 #9
/*529412*/ OPC_EmitInteger32, 0, // 0 #10
/*529414*/ OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F32),
/*MVT::v2f32*/109, 9/*#Ops*/, 3, 2, 5, 4, 6, 7, 8, 9, 10,
// Src: (fadd:{ *:[v2f32] } (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src0_modifiers), (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src1, i32:{ *:[i32] }:$src1_modifiers))<<P:Predicate_my_any_fadd>> - Complexity = -978
// Dst: (V_PK_ADD_F32:{ *:[v2f32] } i32:{ *:[i32] }:$src0_modifiers, v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src1_modifiers, v2f32:{ *:[v2f32] }:$src1)
/*529428*/ /*Scope*/ 33, /*->529462*/
/*529429*/ OPC_CheckType, /*MVT::v2f16*/89,
/*529431*/ OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3
/*529434*/ OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5
/*529437*/ OPC_EmitInteger, /*MVT::i1*/2, 0, // 0 #6
/*529440*/ OPC_EmitInteger32, 0, // 0 #7
/*529442*/ OPC_EmitInteger32, 0, // 0 #8
/*529444*/ OPC_EmitInteger32, 0, // 0 #9
/*529446*/ OPC_EmitInteger32, 0, // 0 #10
/*529448*/ OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F16),
/*MVT::v2f16*/89, 9/*#Ops*/, 3, 2, 5, 4, 6, 7, 8, 9, 10,
// Src: (fadd:{ *:[v2f16] } (VOP3PMods:{ *:[v2f16] } v2f16:{ *:[v2f16] }:$src0, i32:{ *:[i32] }:$src0_modifiers), (VOP3PMods:{ *:[v2f16] } v2f16:{ *:[v2f16] }:$src1, i32:{ *:[i32] }:$src1_modifiers)) - Complexity = -979
// Dst: (V_PK_ADD_F16:{ *:[v2f16] } i32:{ *:[i32] }:$src0_modifiers, v2f16:{ *:[v2f16] }:$src0, i32:{ *:[i32] }:$src1_modifiers, v2f16:{ *:[v2f16] }:$src1)
/*529424*/ /*Scope*/ 35, /*->529460*/
/*529425*/ OPC_CheckType, /*MVT::v2f32*/109,
/*529427*/ OPC_CheckPatternPredicate, 104, // (Subtarget->hasPackedFP32Ops())
/*529429*/ OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3
/*529432*/ OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5
/*529435*/ OPC_EmitInteger, /*MVT::i1*/2, 0, // 0 #6
/*529438*/ OPC_EmitInteger32, 0, // 0 #7
/*529440*/ OPC_EmitInteger32, 0, // 0 #8
/*529442*/ OPC_EmitInteger32, 0, // 0 #9
/*529444*/ OPC_EmitInteger32, 0, // 0 #10
/*529446*/ OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F32),
/*MVT::v2f32*/109, 9/*#Ops*/, 3, 2, 5, 4, 6, 7, 8, 9, 10,
// Src: (fadd:{ *:[v2f32] } (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src0_modifiers), (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src1, i32:{ *:[i32] }:$src1_modifiers)) - Complexity = -979
// Dst: (V_PK_ADD_F32:{ *:[v2f32] } i32:{ *:[i32] }:$src0_modifiers, v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src1_modifiers, v2f32:{ *:[v2f32] }:$src1)
/*529460*/ 0, /*End of Scope*/
/*529461*/ 0, /*End of Scope*/
/*529462*/ 0, /*End of Scope*/
/*529462*/ 0, /*End of Scope*/
/*529463*/ 0, /*End of Scope*/
/*529464*/ 0, /*End of Scope*/
}; // Total Array size is 563301 bytes

SelectCodeCommon(N, MatcherTable, sizeof(MatcherTable));
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1143,8 +1143,9 @@ struct MaxsUnpackPackedF32OpsDAGMutation : ScheduleDAGMutation {
const TargetInstrInfo &TII = *DAG->TII;
const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
for (auto &I : *DAG) {
if (I.getOpcode() == AMDGPU::V_PK_ADD_F32)
if (I.getOpcode() == AMDGPU::V_PK_ADD_F32) {
I.dump();
}
I.dump();
}
llvm::dbgs() << "Completed MaxsUnpackPackedF32OpsDAGMutation\n";
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1144,9 +1144,13 @@ def MAIInstInfoTable : GenericTable {
let PrimaryKeyName = "getMAIInstInfoHelper";
}

// NOTE: this still doesn't work, because
// SubtargetPredicate = HasPackedFP32Ops
// doesn't just determine instruction selection; it also determines
// type legalization, etc.
// (general spaghetti)
def my_any_fadd : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fadd node:$lhs, node:$rhs), (fadd node:$lhs, node:$rhs)]> {
let PredicateCode = [{ return MaxsComplexPatternPackedFP(N); }];
let PredicateCode = [{ return MaxsComplexPatternPackedFP(N, CurDAG); }];
}

let isCommutable = 1, isReMaterializable = 1 in {
Expand Down
90 changes: 90 additions & 0 deletions llvm/lib/Target/AMDGPU/test_v_pk.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
; /home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll -mtriple=amdgcn -mcpu=gfx942 -o -
; /home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll -mattr=-packed-fp32-ops -mtriple=amdgcn -mcpu=gfx942 -o -

; Test kernel for packed <2 x float> fadd selection.
;
; Arguments:
;   %0, %1 - read-only input float buffers in addrspace(1)
;   %2     - write-only output float buffer in addrspace(1)
;   %3     - i32 bound; the body only runs when the computed base index is
;            (signed) less than this value
;
; Each work-item computes a base index
;   %10 = ((workitem.id.x << 2) & 1020) | (workgroup.id.x << 10)
; loads scalar floats from %0 and %1 around that index, packs pairs of them
; into <2 x float> vectors, performs one vector fadd, and stores extracted
; lanes back to %2.
;
; NOTE(review): attribute group #0 is referenced below but no definition of
; it appears in this file - confirm it is intentionally omitted.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define amdgpu_kernel void @add_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture readonly %1, ptr addrspace(1) nocapture writeonly %2, i32 %3) local_unnamed_addr #0 {
; Entry block (implicit label %4): build the base element index %10 and
; branch on the bounds check against %3.
%5 = tail call i32 @llvm.amdgcn.workgroup.id.x()
%6 = shl i32 %5, 10
%7 = tail call i32 @llvm.amdgcn.workitem.id.x()
%8 = shl i32 %7, 2
%9 = and i32 %8, 1020
%10 = or disjoint i32 %9, %6
%11 = icmp slt i32 %10, %3
br i1 %11, label %.critedge, label %.critedge2

.critedge: ; preds = %4
; Neighbouring element indices: %12 = %10|3, %13 = %10|2, %14 = %10|1.
%12 = or disjoint i32 %10, 3
%13 = or disjoint i32 %10, 2
%14 = or disjoint i32 %10, 1
; %17 points at element %10 of the first input (%0), cast to a generic pointer.
%15 = sext i32 %10 to i64
%16 = getelementptr float, ptr addrspace(1) %0, i64 %15
%17 = addrspacecast ptr addrspace(1) %16 to ptr
; Load the floats at byte offsets 0 and 4 from %17.
%18 = load float, ptr %17, align 16
%19 = getelementptr inbounds i8, ptr %17, i64 4
%20 = load float, ptr %19, align 4

; %v_102 = <%18, %20>: first-input floats at byte offsets 0 (lane 0) and 4 (lane 1).
%v_100 = insertelement <2 x float> undef, float %18, i32 0
%v_102 = insertelement <2 x float> %v_100, float %20, i32 1

; Load the floats at byte offsets 8 and 12 from %17.
%21 = getelementptr inbounds i8, ptr %17, i64 8
%22 = load float, ptr %21, align 8
%23 = getelementptr inbounds i8, ptr %17, i64 12
%24 = load float, ptr %23, align 4

; %v_202 = <%22, %24>; its only use is the commented-out fadd (%v_501) below.
%v_200 = insertelement <2 x float> undef, float %22, i32 0
%v_202 = insertelement <2 x float> %v_200, float %24, i32 1

; %26 points at element %10 of the second input (%1), cast to a generic pointer.
%25 = getelementptr float, ptr addrspace(1) %1, i64 %15
%26 = addrspacecast ptr addrspace(1) %25 to ptr
; Output-buffer addresses for %12, %13 and %14. %28, %30 and %32 are not
; referenced by any later instruction in this function; the stores below
; recompute their addresses as %p_*.
%27 = sext i32 %12 to i64
%28 = getelementptr float, ptr addrspace(1) %2, i64 %27
%29 = sext i32 %13 to i64
%30 = getelementptr float, ptr addrspace(1) %2, i64 %29
%31 = sext i32 %14 to i64
%32 = getelementptr float, ptr addrspace(1) %2, i64 %31
; Second-input floats are loaded in descending byte-offset order (12, 8, 4, 0).
%33 = getelementptr inbounds i8, ptr %26, i64 12
%34 = load float, ptr %33, align 4

%36 = getelementptr inbounds i8, ptr %26, i64 8
%37 = load float, ptr %36, align 8

; %v_302 = <%34, %37>: offsets 12 (lane 0) and 8 (lane 1); its only use is the
; commented-out fadd (%v_501) below.
%v_300 = insertelement <2 x float> undef, float %34, i32 0
%v_302 = insertelement <2 x float> %v_300, float %37, i32 1

%39 = getelementptr inbounds i8, ptr %26, i64 4
%40 = load float, ptr %39, align 4
%42 = load float, ptr %26, align 16

; %v_402 = <%40, %42>: second-input floats at byte offsets 4 (lane 0) and 0 (lane 1).
%v_400 = insertelement <2 x float> undef, float %40, i32 0
%v_402 = insertelement <2 x float> %v_400, float %42, i32 1

; The single live vector add: lane 0 = %18 + %40, lane 1 = %20 + %42.
%v_500 = fadd <2 x float> %v_102, %v_402
; %v_501 = fadd <2 x float> %v_202, %v_302
; tail call void @llvm.amdgcn.iglp.opt(i32 4)

; %v_45 = extractelement <2 x float> %v_501, i32 1
; %v_32 = extractelement <2 x float> %v_501, i32 0
; Split the result back into scalars: %v_30 is lane 1, %v_28 is lane 0.
%v_30 = extractelement <2 x float> %v_500, i32 1
%v_28 = extractelement <2 x float> %v_500, i32 0

; Store lane 0 (%v_28) to output element %10.
%i_44 = sext i32 %10 to i64
%p_45 = getelementptr float, ptr addrspace(1) %2, i64 %i_44
store float %v_28, ptr addrspace(1) %p_45, align 4

; %i_31 = sext i32 %14 to i64
; %p_32 = getelementptr float, ptr addrspace(1) %2, i64 %i_31
; store float %v_32, ptr addrspace(1) %p_32, align 4

; Store lane 1 (%v_30) to output element %13 (= %10|2).
%i_29 = sext i32 %13 to i64
%p_30 = getelementptr float, ptr addrspace(1) %2, i64 %i_29
store float %v_30, ptr addrspace(1) %p_30, align 4

; Store lane 0 (%v_28) again, this time to output element %12 (= %10|3).
; NOTE(review): lane 0 is written to two output slots while element %14 is
; only written by the commented-out store above - confirm this is intentional.
%i_27 = sext i32 %12 to i64
%p_28 = getelementptr float, ptr addrspace(1) %2, i64 %i_27
store float %v_28, ptr addrspace(1) %p_28, align 4

br label %.critedge2

; Common exit, reached both from the entry block (bounds check failed) and
; from .critedge.
.critedge2: ; preds = %4, %.critedge
ret void
}