diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 368e1a646c83b..76c49336aabca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -3363,9 +3363,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // Already computed the OpcodeOffset table, just index into it. if (N.getOpcode() < OpcodeOffset.size()) MatcherIndex = OpcodeOffset[N.getOpcode()]; - if (N->getOpcode() == ISD::FADD) { - MatcherIndex = 0; - } + if (N->getOpcode() == ISD::FADD) { + MatcherIndex = 0; + } LLVM_DEBUG(dbgs() << " Initial Opcode index to " << MatcherIndex << "\n"); } else if (MatcherTable[0] == OPC_SwitchOpcode) { // Otherwise, the table isn't computed, but the state machine does start diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 75becd6122fb9..ffa9f4f947e54 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -793,7 +793,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { } } - if (N->getOpcode() == ISD::FADD && false) { + if (N->getOpcode() == ISD::FADD) { llvm::dbgs() << "N->dump()\n"; N->dump(); MySelectCode(N); @@ -1717,7 +1717,7 @@ bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode, return true; } -bool AMDGPUDAGToDAGISel::MaxsComplexPatternPackedFP(SDNode *N) const { +bool AMDGPUDAGToDAGISel::MaxsComplexPatternPackedFP(SDNode *N, SelectionDAG *CurDAG) const { N->dump(); return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index ab581586c90a3..ea72d04eb8d7d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -277,7 +277,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { void SelectWAVE_ADDRESS(SDNode *N); void SelectSTACKRESTORE(SDNode *N); - bool 
MaxsComplexPatternPackedFP(SDNode *N) const; + bool MaxsComplexPatternPackedFP(SDNode *N, SelectionDAG *CurDAG) const; protected: // Include the pieces autogenerated from the target description. @@ -290,7 +290,7 @@ void MySelectCode(SDNode *N) { #define TARGET_VAL(X) X & 255, unsigned(X) >> 8 #define COVERAGE_IDX_VAL(X) X & 255, (unsigned(X) >> 8) & 255, (unsigned(X) >> 16) & 255, (unsigned(X) >> 24) & 255 static const unsigned char MatcherTable[] = { -// /*528829*/ /*SwitchOpcode*/ 118|128,4/*630*/, TARGET_VAL(ISD::FADD),// ->529463 +// /*528829*/ /*SwitchOpcode*/ 120|128,4/*632*/, TARGET_VAL(ISD::FADD),// ->529465 /*528833*/ OPC_Scope, 76, /*->528911*/ // 4 children in Scope /*528835*/ OPC_MoveChild0, /*528836*/ OPC_CheckOpcode, TARGET_VAL(ISD::INTRINSIC_WO_CHAIN), @@ -393,7 +393,7 @@ void MySelectCode(SDNode *N) { // Src: (fadd:{ *:[f32] } (AMDGPUfmul_legacy_impl:{ *:[f32] } (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src0, i32:{ *:[i32] }:$src0_mod), (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src1, i32:{ *:[i32] }:$src1_mod)), (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src2, i32:{ *:[i32] }:$src2_mod)) - Complexity = 33 // Dst: (V_MAD_LEGACY_F32_e64:{ *:[f32] } ?:{ *:[i32] }:$src0_mod, ?:{ *:[f32] }:$src0, ?:{ *:[i32] }:$src1_mod, ?:{ *:[f32] }:$src1, ?:{ *:[i32] }:$src2_mod, ?:{ *:[f32] }:$src2, 0:{ *:[i1] }, 0:{ *:[i32] }) /*529063*/ 0, /*End of Scope*/ -/*529064*/ /*Scope*/ 12|128,3/*396*/, /*->529462*/ +/*529064*/ /*Scope*/ 14|128,3/*398*/, /*->529464*/ /*529066*/ OPC_RecordChild0, // #0 = $VOP3NoMods:src2 /*529067*/ OPC_Scope, 74, /*->529143*/ // 2 children in Scope /*529069*/ OPC_MoveChild1, @@ -428,7 +428,7 @@ void MySelectCode(SDNode *N) { // Src: (fadd:{ *:[f32] } (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src2, i32:{ *:[i32] }:$src2_mod), (AMDGPUfmul_legacy_impl:{ *:[f32] } (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src0, i32:{ *:[i32] }:$src0_mod), (VOP3Mods:{ *:[f32] } f32:{ *:[f32] }:$src1, i32:{ *:[i32] }:$src1_mod))) - Complexity = 33 // Dst: (V_MAD_LEGACY_F32_e64:{ 
*:[f32] } ?:{ *:[i32] }:$src0_mod, ?:{ *:[f32] }:$src0, ?:{ *:[i32] }:$src1_mod, ?:{ *:[f32] }:$src1, ?:{ *:[i32] }:$src2_mod, ?:{ *:[f32] }:$src2, 0:{ *:[i1] }, 0:{ *:[i32] }) /*529142*/ 0, /*End of Scope*/ -/*529143*/ /*Scope*/ 60|128,2/*316*/, /*->529461*/ +/*529143*/ /*Scope*/ 62|128,2/*318*/, /*->529463*/ /*529145*/ OPC_RecordChild1, // #1 = $src1 /*529146*/ OPC_Scope, 25, /*->529173*/ // 6 children in Scope /*529148*/ OPC_CheckPredicate3, // Predicate_anonymous_13768 @@ -543,36 +543,37 @@ void MySelectCode(SDNode *N) { // Src: (fadd:{ *:[f64] } (VOP3Mods:{ *:[f64] } f64:{ *:[f64] }:$src1, i32:{ *:[i32] }:$src1_modifiers), (VOP3Mods0:{ *:[f64] } f64:{ *:[f64] }:$src0, i32:{ *:[i32] }:$src0_modifiers, i1:{ *:[i1] }:$clamp, i32:{ *:[i32] }:$omod)) - Complexity = -973 // Dst: (V_ADD_F64_e64:{ *:[f64] } i32:{ *:[i32] }:$src0_modifiers, f64:{ *:[f64] }:$src0, i32:{ *:[i32] }:$src1_modifiers, f64:{ *:[f64] }:$src1, i1:{ *:[i1] }:$clamp, i32:{ *:[i32] }:$omod) /*529389*/ 0, /*End of Scope*/ -/*529390*/ /*Scope*/ 33, /*->529424*/ -/*529391*/ OPC_CheckType, /*MVT::v2f16*/89, -/*529393*/ OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3 -/*529396*/ OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5 -/*529399*/ OPC_EmitInteger, /*MVT::i1*/2, 0, // 0 #6 -/*529402*/ OPC_EmitInteger32, 0, // 0 #7 -/*529404*/ OPC_EmitInteger32, 0, // 0 #8 -/*529406*/ OPC_EmitInteger32, 0, // 0 #9 -/*529408*/ OPC_EmitInteger32, 0, // 0 #10 -/*529410*/ OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F16), +/*529390*/ /*Scope*/ 37, /*->529428*/ +/*529391*/ OPC_CheckType, /*MVT::v2f32*/109, +/*529393*/ OPC_CheckPredicate, 108, // Predicate_my_any_fadd +/*529395*/ OPC_CheckPatternPredicate, 104, // (Subtarget->hasPackedFP32Ops()) +/*529397*/ OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3 +/*529400*/ OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5 +/*529403*/ OPC_EmitInteger, /*MVT::i1*/2, 0, // 0 #6 +/*529406*/ 
OPC_EmitInteger32, 0, // 0 #7 +/*529408*/ OPC_EmitInteger32, 0, // 0 #8 +/*529410*/ OPC_EmitInteger32, 0, // 0 #9 +/*529412*/ OPC_EmitInteger32, 0, // 0 #10 +/*529414*/ OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F32), + /*MVT::v2f32*/109, 9/*#Ops*/, 3, 2, 5, 4, 6, 7, 8, 9, 10, + // Src: (fadd:{ *:[v2f32] } (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src0_modifiers), (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src1, i32:{ *:[i32] }:$src1_modifiers))<> - Complexity = -978 + // Dst: (V_PK_ADD_F32:{ *:[v2f32] } i32:{ *:[i32] }:$src0_modifiers, v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src1_modifiers, v2f32:{ *:[v2f32] }:$src1) +/*529428*/ /*Scope*/ 33, /*->529462*/ +/*529429*/ OPC_CheckType, /*MVT::v2f16*/89, +/*529431*/ OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3 +/*529434*/ OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5 +/*529437*/ OPC_EmitInteger, /*MVT::i1*/2, 0, // 0 #6 +/*529440*/ OPC_EmitInteger32, 0, // 0 #7 +/*529442*/ OPC_EmitInteger32, 0, // 0 #8 +/*529444*/ OPC_EmitInteger32, 0, // 0 #9 +/*529446*/ OPC_EmitInteger32, 0, // 0 #10 +/*529448*/ OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F16), /*MVT::v2f16*/89, 9/*#Ops*/, 3, 2, 5, 4, 6, 7, 8, 9, 10, // Src: (fadd:{ *:[v2f16] } (VOP3PMods:{ *:[v2f16] } v2f16:{ *:[v2f16] }:$src0, i32:{ *:[i32] }:$src0_modifiers), (VOP3PMods:{ *:[v2f16] } v2f16:{ *:[v2f16] }:$src1, i32:{ *:[i32] }:$src1_modifiers)) - Complexity = -979 // Dst: (V_PK_ADD_F16:{ *:[v2f16] } i32:{ *:[i32] }:$src0_modifiers, v2f16:{ *:[v2f16] }:$src0, i32:{ *:[i32] }:$src1_modifiers, v2f16:{ *:[v2f16] }:$src1) -/*529424*/ /*Scope*/ 35, /*->529460*/ -/*529425*/ OPC_CheckType, /*MVT::v2f32*/109, -/*529427*/ OPC_CheckPatternPredicate, 104, // (Subtarget->hasPackedFP32Ops()) -/*529429*/ OPC_CheckComplexPat, /*CP*/13, /*#*/0, // SelectVOP3PMods:$ #2 #3 -/*529432*/ OPC_CheckComplexPat, /*CP*/13, /*#*/1, // SelectVOP3PMods:$ #4 #5 -/*529435*/ OPC_EmitInteger, /*MVT::i1*/2, 0, // 
0 #6 -/*529438*/ OPC_EmitInteger32, 0, // 0 #7 -/*529440*/ OPC_EmitInteger32, 0, // 0 #8 -/*529442*/ OPC_EmitInteger32, 0, // 0 #9 -/*529444*/ OPC_EmitInteger32, 0, // 0 #10 -/*529446*/ OPC_MorphNodeTo1None, TARGET_VAL(AMDGPU::V_PK_ADD_F32), - /*MVT::v2f32*/109, 9/*#Ops*/, 3, 2, 5, 4, 6, 7, 8, 9, 10, - // Src: (fadd:{ *:[v2f32] } (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src0_modifiers), (VOP3PMods:{ *:[v2f32] } v2f32:{ *:[v2f32] }:$src1, i32:{ *:[i32] }:$src1_modifiers)) - Complexity = -979 - // Dst: (V_PK_ADD_F32:{ *:[v2f32] } i32:{ *:[i32] }:$src0_modifiers, v2f32:{ *:[v2f32] }:$src0, i32:{ *:[i32] }:$src1_modifiers, v2f32:{ *:[v2f32] }:$src1) -/*529460*/ 0, /*End of Scope*/ -/*529461*/ 0, /*End of Scope*/ -/*529462*/ 0, /*End of Scope*/ +/*529462*/ 0, /*End of Scope*/ +/*529463*/ 0, /*End of Scope*/ +/*529464*/ 0, /*End of Scope*/ }; // Total Array size is 563301 bytes SelectCodeCommon(N, MatcherTable, sizeof(MatcherTable)); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index caf115490dccc..a319de43d23e7 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1143,8 +1143,9 @@ struct MaxsUnpackPackedF32OpsDAGMutation : ScheduleDAGMutation { const TargetInstrInfo &TII = *DAG->TII; const GCNSubtarget &ST = DAG->MF.getSubtarget(); for (auto &I : *DAG) { - if (I.getOpcode() == AMDGPU::V_PK_ADD_F32) + if (I.getOpcode() == AMDGPU::V_PK_ADD_F32) { I.dump(); + } I.dump(); } llvm::dbgs() << "Completed MaxsUnpackPackedF32OpsDAGMutation\n"; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 5768f27a2a263..88fb90eab8118 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1144,9 +1144,13 @@ def MAIInstInfoTable : GenericTable { let PrimaryKeyName = "getMAIInstInfoHelper"; } +// this still doesn't work because +// 
SubtargetPredicate = HasPackedFP32Ops +// doesn't just determine selection; it also determines type legalization, etc. +// (general spaghetti) def my_any_fadd : PatFrags<(ops node:$lhs, node:$rhs), [(strict_fadd node:$lhs, node:$rhs), (fadd node:$lhs, node:$rhs)]> { - let PredicateCode = [{ return MaxsComplexPatternPackedFP(N); }]; + let PredicateCode = [{ return MaxsComplexPatternPackedFP(N, CurDAG); }]; } let isCommutable = 1, isReMaterializable = 1 in { diff --git a/llvm/lib/Target/AMDGPU/test_v_pk.ll b/llvm/lib/Target/AMDGPU/test_v_pk.ll new file mode 100644 index 0000000000000..19d7ad418f187 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/test_v_pk.ll @@ -0,0 +1,90 @@ +; /home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll -mtriple=amdgcn -mcpu=gfx942 -o - +; /home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll -mattr=-packed-fp32-ops -mtriple=amdgcn -mcpu=gfx942 -o - + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define amdgpu_kernel void @add_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture readonly %1, ptr addrspace(1) nocapture writeonly %2, i32 %3) local_unnamed_addr #0 { + %5 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %6 = shl i32 %5, 10 + %7 = tail call i32 @llvm.amdgcn.workitem.id.x() + %8 = shl i32 %7, 2 + %9 = and i32 %8, 1020 + %10 = or disjoint i32 %9, %6 + %11 = icmp slt i32 %10, %3 + br i1 %11, label %.critedge, label %.critedge2 + +.critedge: ; preds = %4 + %12 = or disjoint i32 %10, 3 + %13 = or disjoint i32 %10, 2 + %14 = or disjoint i32 %10, 1 + %15 = sext i32 %10 to i64 + %16 = getelementptr float, ptr addrspace(1) %0, i64 %15 + %17 = addrspacecast ptr addrspace(1) %16 to ptr + %18 = load float, ptr %17, align 16 + %19 = getelementptr inbounds i8, ptr %17, i64 4 + %20 = load float, ptr %19, align 4 + + %v_100 = insertelement <2 x float> undef, float %18, i32 0 + %v_102 = insertelement <2 x float> %v_100, float %20,
i32 1 + + %21 = getelementptr inbounds i8, ptr %17, i64 8 + %22 = load float, ptr %21, align 8 + %23 = getelementptr inbounds i8, ptr %17, i64 12 + %24 = load float, ptr %23, align 4 + + %v_200 = insertelement <2 x float> undef, float %22, i32 0 + %v_202 = insertelement <2 x float> %v_200, float %24, i32 1 + + %25 = getelementptr float, ptr addrspace(1) %1, i64 %15 + %26 = addrspacecast ptr addrspace(1) %25 to ptr + %27 = sext i32 %12 to i64 + %28 = getelementptr float, ptr addrspace(1) %2, i64 %27 + %29 = sext i32 %13 to i64 + %30 = getelementptr float, ptr addrspace(1) %2, i64 %29 + %31 = sext i32 %14 to i64 + %32 = getelementptr float, ptr addrspace(1) %2, i64 %31 + %33 = getelementptr inbounds i8, ptr %26, i64 12 + %34 = load float, ptr %33, align 4 + + %36 = getelementptr inbounds i8, ptr %26, i64 8 + %37 = load float, ptr %36, align 8 + + %v_300 = insertelement <2 x float> undef, float %34, i32 0 + %v_302 = insertelement <2 x float> %v_300, float %37, i32 1 + + %39 = getelementptr inbounds i8, ptr %26, i64 4 + %40 = load float, ptr %39, align 4 + %42 = load float, ptr %26, align 16 + + %v_400 = insertelement <2 x float> undef, float %40, i32 0 + %v_402 = insertelement <2 x float> %v_400, float %42, i32 1 + + %v_500 = fadd <2 x float> %v_102, %v_402 + ; %v_501 = fadd <2 x float> %v_202, %v_302 + ; tail call void @llvm.amdgcn.iglp.opt(i32 4) + + ; %v_45 = extractelement <2 x float> %v_501, i32 1 + ; %v_32 = extractelement <2 x float> %v_501, i32 0 + %v_30 = extractelement <2 x float> %v_500, i32 1 + %v_28 = extractelement <2 x float> %v_500, i32 0 + + %i_44 = sext i32 %10 to i64 + %p_45 = getelementptr float, ptr addrspace(1) %2, i64 %i_44 + store float %v_28, ptr addrspace(1) %p_45, align 4 + + ; %i_31 = sext i32 %14 to i64 + ; %p_32 = getelementptr float, ptr addrspace(1) %2, i64 %i_31 + ; store float %v_32, ptr addrspace(1) %p_32, align 4 + + %i_29 = sext i32 %13 to i64 + %p_30 = getelementptr float, ptr addrspace(1) %2, i64 %i_29 + store float %v_30, ptr 
addrspace(1) %p_30, align 4 + + %i_27 = sext i32 %12 to i64 + %p_28 = getelementptr float, ptr addrspace(1) %2, i64 %i_27 + store float %v_28, ptr addrspace(1) %p_28, align 4 + + br label %.critedge2 + +.critedge2: ; preds = %4, %.critedge + ret void +}