-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU]: Add support to unpack V_PK_MOV_B32 #163464
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 6 commits
dc0089e
8587628
27affd8
25b9c71
e2fbe0c
08fdc07
44b3744
75daff0
53009ee
4089361
6b43f6c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -59,7 +59,7 @@ class SIPreEmitPeephole { | |||||||||||
| // v_fma_f32 v1, v0, v2, v2 | ||||||||||||
| // Here, we have overwritten v0 before we use it. This function checks if | ||||||||||||
| // unpacking can lead to such a situation. | ||||||||||||
| bool canUnpackingClobberRegister(const MachineInstr &MI); | ||||||||||||
| bool canUnpackingClobberRegister(MachineInstr &MI); | ||||||||||||
| // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and | ||||||||||||
| // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for | ||||||||||||
| // this transformation. | ||||||||||||
|
|
@@ -451,7 +451,10 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, | |||||||||||
| return true; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { | ||||||||||||
| // If support is extended to new operations, add tests in | ||||||||||||
| // llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. | ||||||||||||
|
|
||||||||||||
| bool SIPreEmitPeephole::canUnpackingClobberRegister(MachineInstr &MI) { | ||||||||||||
| unsigned OpCode = MI.getOpcode(); | ||||||||||||
| Register DstReg = MI.getOperand(0).getReg(); | ||||||||||||
| // Only the first register in the register pair needs to be checked due to the | ||||||||||||
|
|
@@ -462,21 +465,9 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { | |||||||||||
| // Such scenarios can arise due to specific combinations of op_sel and | ||||||||||||
| // op_sel_hi modifiers. | ||||||||||||
| Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); | ||||||||||||
|
|
||||||||||||
| const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | ||||||||||||
| if (Src0MO && Src0MO->isReg()) { | ||||||||||||
| Register SrcReg0 = Src0MO->getReg(); | ||||||||||||
| unsigned Src0Mods = | ||||||||||||
| TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); | ||||||||||||
| Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) | ||||||||||||
| ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) | ||||||||||||
| : TRI->getSubReg(SrcReg0, AMDGPU::sub0); | ||||||||||||
| // Check if the register selected by op_sel_hi is the same as the first | ||||||||||||
| // register in the destination register pair. | ||||||||||||
| if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) | ||||||||||||
| return true; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| uint16_t UnpackedOpCode = mapToUnpackedOpcode(MI); | ||||||||||||
| bool UnpackedInstHasOneSrcOp = | ||||||||||||
| !AMDGPU::hasNamedOperand(UnpackedOpCode, AMDGPU::OpName::src1); | ||||||||||||
| const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | ||||||||||||
| if (Src1MO && Src1MO->isReg()) { | ||||||||||||
| Register SrcReg1 = Src1MO->getReg(); | ||||||||||||
|
|
@@ -485,25 +476,44 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { | |||||||||||
| Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1) | ||||||||||||
| ? TRI->getSubReg(SrcReg1, AMDGPU::sub1) | ||||||||||||
| : TRI->getSubReg(SrcReg1, AMDGPU::sub0); | ||||||||||||
| // Check if the register selected by op_sel_hi is the same as the first | ||||||||||||
| // register in the destination register pair. | ||||||||||||
| if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) | ||||||||||||
| return true; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| // Applicable for packed instructions with 3 source operands, such as | ||||||||||||
| // V_PK_FMA. | ||||||||||||
| if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { | ||||||||||||
| const MachineOperand *Src2MO = | ||||||||||||
| TII->getNamedOperand(MI, AMDGPU::OpName::src2); | ||||||||||||
| if (Src2MO && Src2MO->isReg()) { | ||||||||||||
| Register SrcReg2 = Src2MO->getReg(); | ||||||||||||
| unsigned Src2Mods = | ||||||||||||
| TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm(); | ||||||||||||
| Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) | ||||||||||||
| ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) | ||||||||||||
| : TRI->getSubReg(SrcReg2, AMDGPU::sub0); | ||||||||||||
| if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) | ||||||||||||
| // V_MOV_B32s have one src operand. Other candidate unpacked instructions with | ||||||||||||
| // 2 or more src operands will perform the following checks. | ||||||||||||
| if (!UnpackedInstHasOneSrcOp) { | ||||||||||||
| const MachineOperand *Src0MO = | ||||||||||||
| TII->getNamedOperand(MI, AMDGPU::OpName::src0); | ||||||||||||
|
||||||||||||
| if (Src0MO && Src0MO->isReg()) { | ||||||||||||
| Register SrcReg0 = Src0MO->getReg(); | ||||||||||||
| unsigned Src0Mods = | ||||||||||||
| TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); | ||||||||||||
| Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) | ||||||||||||
| ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) | ||||||||||||
| : TRI->getSubReg(SrcReg0, AMDGPU::sub0); | ||||||||||||
| if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) | ||||||||||||
| return true; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| // Applicable for packed instructions with 3 source operands, such as | ||||||||||||
| // V_PK_FMA. | ||||||||||||
| if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { | ||||||||||||
| const MachineOperand *Src2MO = | ||||||||||||
| TII->getNamedOperand(MI, AMDGPU::OpName::src2); | ||||||||||||
|
||||||||||||
| if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { | |
| const MachineOperand *Src2MO = | |
| TII->getNamedOperand(MI, AMDGPU::OpName::src2); | |
| if (const MachineOperand *Src2MO = | |
| TII->getNamedOperand(MI, AMDGPU::OpName::src2); |
akadutta marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know why it's written this way, but you're making a set of redundant conditions worse. The mapToUnpackedOpcode call should be sunk and supersedes all of the other checks here (e.g., why is it bothering checking isTerminator?)
Not sure what any of this has to do with adding the new case
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove the hasNamedOperand check / fold it together with getNamedOperand like above
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
performF32Unpacking shouldn't fail to unpack. If we have some condition that would cause unpacking to fail, it should be recognized at collection time and cause collection to stop. Otherwise, we may fail to unpack an earlier instruction in InstrsToUnpack and successfully / unnecessarily unpack a later instruction.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Check moved to collectUnpackingCandidates
Uh oh!
There was an error while loading. Please reload this page.