Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 23 additions & 15 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -916,21 +916,29 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
if (SIInstrInfo::isSDWA(MI)) {
// Type 1: SDWA with dst_sel != DWORD
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
return nullptr;
} else {
// Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
// with op_sel[3:2] != 0)
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
SISrcMods::DST_OP_SEL ||
(AMDGPU::isFP8DstSelInst(Opcode) &&
(TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
SISrcMods::OP_SEL_0))))
return nullptr;
}

return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}

if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
// Type 2: VOP3 which write the hi bits
if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
SISrcMods::DST_OP_SEL)
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

// Type 3: FP8DstSelInst with op_sel[3:2] != 0
if (AMDGPU::isFP8DstSelInst(Opcode) &&
(TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
SISrcMods::OP_SEL_0))
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}

// Special case: nop is required for all the opsel values for fp4 sr variant
// cvt scale instructions
if (AMDGPU::isFP4DstSelInst(Opcode))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you combine the fp4 and fp8 queries?

return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

return nullptr;
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2567,6 +2567,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsFP8SrcByteSel = 0;
field bit IsFP8DstByteSel = 0;
field bit HasFP8DstByteSel = 0;
field bit HasFP4DstByteSel = 0;
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);

field bit HasDst = !ne(DstVT.Value, untyped.Value);
Expand Down Expand Up @@ -3258,6 +3259,15 @@ def FP8DstByteSelTable : GenericTable {
let PrimaryKeyName = "getFP8DstByteSelHelper";
}

// Searchable table exposing the HasFP4DstByteSel bit per opcode. Rows are
// emitted with the C++ type FP4DstByteSelInfo and are looked up by opcode
// through the generated getFP4DstByteSelHelper() function.
def FP4DstByteSelTable : GenericTable {
// Records of this class are the ones collected into the table.
let FilterClass = "VOP3_Pseudo";
// Name of the C++ struct used for each row; its members must line up with
// the Fields list below.
let CppTypeName = "FP4DstByteSelInfo";
// Columns emitted for every row, in this order.
let Fields = ["Opcode", "HasFP4DstByteSel"];

// Rows are keyed by Opcode; PrimaryKeyName is the name of the generated
// lookup function.
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getFP4DstByteSelHelper";
}

def VOPDComponentTable : GenericTable {
let FilterClass = "VOPD_Component";
let CppTypeName = "VOPDComponentInfo";
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,8 @@ struct VOPTrue16Info {

#define GET_FP8DstByteSelTable_DECL
#define GET_FP8DstByteSelTable_IMPL
#define GET_FP4DstByteSelTable_DECL
#define GET_FP4DstByteSelTable_IMPL

struct DPMACCInstructionInfo {
uint16_t Opcode;
Expand All @@ -391,6 +393,11 @@ struct FP8DstByteSelInfo {
bool HasFP8DstByteSel;
};

/// One row of the FP4 dst-byte-sel lookup table, as returned by
/// getFP4DstByteSelHelper().
/// NOTE(review): member order and types presumably have to match the
/// TableGen-generated table rows (Opcode, HasFP4DstByteSel) - confirm before
/// reordering or adding members.
struct FP4DstByteSelInfo {
// Opcode this row describes; the lookup key.
uint16_t Opcode;
// Flag reported by isFP4DstSelInst() for this opcode.
bool HasFP4DstByteSel;
};

#define GET_FP8DstByteSelTable_DECL
#define GET_FP8DstByteSelTable_IMPL
#define GET_MTBUFInfoTable_DECL
Expand Down Expand Up @@ -662,6 +669,11 @@ bool isFP8DstSelInst(unsigned Opc) {
return Info ? Info->HasFP8DstByteSel : false;
}

bool isFP4DstSelInst(unsigned Opc) {
const FP4DstByteSelInfo *Info = getFP4DstByteSelHelper(Opc);
return Info ? Info->HasFP4DstByteSel : false;
}

unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
return Info ? Info->Opcode3Addr : ~0u;
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -887,6 +887,9 @@ bool isTrue16Inst(unsigned Opc);
LLVM_READONLY
bool isFP8DstSelInst(unsigned Opc);

LLVM_READONLY
bool isFP4DstSelInst(unsigned Opc);

LLVM_READONLY
bool isInvalidSingleUseConsumerInst(unsigned Opc);

Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1014,7 +1014,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
let HasFP8DstByteSel = 1;
let HasFP4DstByteSel = 1;
}

def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
Expand All @@ -1026,7 +1026,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
let HasFP8DstByteSel = 1;
let HasFP4DstByteSel = 1;
}

class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let IsSWMMAC = P.IsSWMMAC;

bit HasFP8DstByteSel = P.HasFP8DstByteSel;
bit HasFP4DstByteSel = P.HasFP4DstByteSel;

let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
P.AsmVOP3OpSel,
Expand Down
15 changes: 9 additions & 6 deletions llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
Original file line number Diff line number Diff line change
Expand Up @@ -642,17 +642,18 @@ body: |
...

---
name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard
name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_WAITCNT 0
Expand Down Expand Up @@ -731,17 +732,18 @@ body: |
...

---
name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard
name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why rename here?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change now adds a nop for opsel==0 as well, so it's no longer a negative test.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was never a negative test, though; it was pre-committed for this change.

body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_WAITCNT 0
Expand Down Expand Up @@ -1119,17 +1121,18 @@ body: |
...

---
name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard
name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard
; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_WAITCNT 0
Expand Down
Loading