-
Notifications
You must be signed in to change notification settings - Fork 15.3k
AMDGPU: Use RegClassByHwMode to manage GWS operand special case #169373
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Use RegClassByHwMode to manage GWS operand special case #169373
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: On targets that require even aligned 64-bit VGPRs, GWS operands require even alignment of a 32-bit operand. This also should be applied to the image special case. Patch is 34.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169373.diff 13 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6c36f8ad9b6a9..78a3ec7f0c266 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3080,9 +3080,38 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
const unsigned Opc = gwsIntrinToOpcode(IntrID);
+
+ const MCInstrDesc &InstrDesc = TII->get(Opc);
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+
+ const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
+
SmallVector<SDValue, 5> Ops;
- if (HasVSrc)
- Ops.push_back(N->getOperand(2));
+ if (HasVSrc) {
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+
+ SDValue Data = N->getOperand(2);
+ MVT DataVT = Data.getValueType().getSimpleVT();
+ if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
+ // Normal 32-bit case.
+ Ops.push_back(N->getOperand(2));
+ } else {
+ // Operand is really 32-bits, but requires 64-bit alignment, so use the
+ // even aligned 64-bit register class.
+ const SDValue RegSeqOps[] = {
+ CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
+ CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
+ 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
+
+ Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+ SL, MVT::v2i32, RegSeqOps),
+ 0));
+ }
+ }
+
Ops.push_back(OffsetField);
Ops.push_back(Chain);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 650df2a87506a..c575714cf61cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1946,20 +1946,52 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
// The resource id offset is computed as (<isa opaque base> + M0[21:16] +
// offset field) % 64. Some versions of the programming guide omit the m0
// part, or claim it's from offset 0.
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
+
+ unsigned Opc = gwsIntrinToOpcode(IID);
+ const MCInstrDesc &InstrDesc = TII.get(Opc);
if (HasVSrc) {
Register VSrc = MI.getOperand(1).getReg();
- MIB.addReg(VSrc);
- if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
- return false;
- }
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
+ const TargetRegisterClass *SubRC =
+ TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
+
+ if (!SubRC) {
+ // 32-bit normal case.
+ if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
+ return false;
- MIB.addImm(ImmOffset)
- .cloneMemRefs(MI);
+ BuildMI(*MBB, &MI, DL, InstrDesc)
+ .addReg(VSrc)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
+ } else {
+ // Requires even register alignment, so create 64-bit value and pad the
+ // top half with undef.
+ Register DataReg = MRI->createVirtualRegister(DataRC);
+ if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
+ return false;
- TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
+ Register UndefReg = MRI->createVirtualRegister(SubRC);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
+ .addReg(VSrc)
+ .addImm(AMDGPU::sub0)
+ .addReg(UndefReg)
+ .addImm(AMDGPU::sub1);
+
+ BuildMI(*MBB, &MI, DL, InstrDesc)
+ .addReg(DataReg)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
+ }
+ } else {
+ BuildMI(*MBB, &MI, DL, InstrDesc)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
+ }
MI.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 68060553e558c..7a91a40e18cde 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -347,6 +347,11 @@ class AMDGPUOperand : public MCParsedAsmOperand {
return isRegKind() && getReg() == AMDGPU::SGPR_NULL;
}
+ bool isAV_LdSt_32_Align2_RegOp() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID) ||
+ isRegClass(AMDGPU::AGPR_32RegClassID);
+ }
+
bool isVRegWithInputMods() const;
template <bool IsFake16> bool isT16_Lo128VRegWithInputMods() const;
template <bool IsFake16> bool isT16VRegWithInputMods() const;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index b841171c285d8..040a7112d29c3 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -463,7 +463,7 @@ class DS_GWS_0D <string opName>
class DS_GWS_1D <string opName>
: DS_GWS<opName,
- (ins AVLdSt_32:$data0, Offset:$offset),
+ (ins AV_LdSt_32_Align2_RegOp:$data0, Offset:$offset),
" $data0$offset gds"> {
let has_gws_data0 = 1;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index da287e0243d71..b63d71dc2fde9 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -491,6 +491,18 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
printRegularOperand(MI, OpNo, STI, O);
}
+void AMDGPUInstPrinter::printAVLdSt32Align2RegOp(const MCInst *MI,
+ unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ MCRegister Reg = MI->getOperand(OpNo).getReg();
+
+ // On targets with an even alignment requirement
+ if (MCRegister SubReg = MRI.getSubReg(Reg, AMDGPU::sub0))
+ Reg = SubReg;
+ printRegOperand(Reg, O, MRI);
+}
+
void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index b27295e73ec99..564d6eea52328 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -77,6 +77,9 @@ class AMDGPUInstPrinter : public MCInstPrinter {
raw_ostream &O);
void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printAVLdSt32Align2RegOp(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediateBF16(uint32_t Imm, const MCSubtargetInfo &STI,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4fc397b51f1b1..9bb786985747d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6429,8 +6429,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AMDGPU::DS_GWS_INIT:
case AMDGPU::DS_GWS_SEMA_BR:
case AMDGPU::DS_GWS_BARRIER:
- TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
- [[fallthrough]];
case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c66985a19685b..b0c0223211340 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1653,6 +1653,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const TargetSchedModel &getSchedModel() const { return SchedModel; }
+ // FIXME: This should be removed
// Enforce operand's \p OpName even alignment if required by target.
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 5cff5f2248b02..272d4b5609dfb 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1328,6 +1328,17 @@ def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>,
let DecoderMethod = "decodeSrcRegOrImm9";
}
+
+// Special case for DS_GWS instructions. The register input is really
+// 32-bit, but it needs to be even aligned on targets with a VGPR
+// alignment requirement.
+def AV_LdSt_32_Align2 : SIRegisterClassLike</*Bitwidth=*/32, /*VGPR=*/true, /*AGPR=*/true>,
+ RegClassByHwMode<
+ [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
+ [VGPR_32, VGPR_32, AV_64_Align2, VReg_64_Align2, VReg_64_Align2]> {
+ let DecoderMethod = "decodeAVLdSt<32>";
+}
+
class RegImmMatcher<string name> : AsmOperandClass {
let Name = name;
let RenderMethod = "addRegOrImmOperands";
@@ -1580,6 +1591,17 @@ foreach size = ["64", "96", "128", "160", "256", "1024" ] in {
def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>;
}
+def AV_LdSt_32_Align2_RegMatcher : AsmOperandClass {
+ let Name = "AV_LdSt_32_Align2_RegOp";
+ let RenderMethod = "addRegOperands";
+}
+
+def AV_LdSt_32_Align2_RegOp : RegisterOperand<AV_LdSt_32_Align2> {
+ let ParserMatchClass = AV_LdSt_32_Align2_RegMatcher;
+ let PrintMethod = "printAVLdSt32Align2RegOp";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/gws_agpr.ll b/llvm/test/CodeGen/AMDGPU/gws_agpr.ll
index 2082a519d4f83..d87dac1d69047 100644
--- a/llvm/test/CodeGen/AMDGPU/gws_agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/gws_agpr.ll
@@ -3,128 +3,72 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GISEL %s
define void @gws_init_offset0() #0 {
-; SDAG-LABEL: gws_init_offset0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: ;;#ASMSTART
-; SDAG-NEXT: ; def a0
-; SDAG-NEXT: ;;#ASMEND
-; SDAG-NEXT: s_mov_b32 m0, 0
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: ds_gws_init a0 gds
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: gws_init_offset0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: ;;#ASMSTART
-; GISEL-NEXT: ; def a0
-; GISEL-NEXT: ;;#ASMEND
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: s_mov_b32 m0, 0
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: ds_gws_init v0 gds
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: gws_init_offset0:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_mov_b32 m0, 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: ds_gws_init a0 gds
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%val = call i32 asm "; def $0", "=a"()
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
ret void
}
define void @gws_init_offset63() #0 {
-; SDAG-LABEL: gws_init_offset63:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: ;;#ASMSTART
-; SDAG-NEXT: ; def a0
-; SDAG-NEXT: ;;#ASMEND
-; SDAG-NEXT: s_mov_b32 m0, 0
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: ds_gws_init a0 offset:63 gds
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: gws_init_offset63:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: ;;#ASMSTART
-; GISEL-NEXT: ; def a0
-; GISEL-NEXT: ;;#ASMEND
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: s_mov_b32 m0, 0
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: ds_gws_init v0 offset:63 gds
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: gws_init_offset63:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_mov_b32 m0, 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: ds_gws_init a0 offset:63 gds
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%val = call i32 asm "; def $0", "=a"()
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
ret void
}
define void @gws_init_sgpr_offset(i32 inreg %offset) #0 {
-; SDAG-LABEL: gws_init_sgpr_offset:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: ;;#ASMSTART
-; SDAG-NEXT: ; def a0
-; SDAG-NEXT: ;;#ASMEND
-; SDAG-NEXT: s_lshl_b32 m0, s16, 16
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: ds_gws_init a0 gds
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: gws_init_sgpr_offset:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: ;;#ASMSTART
-; GISEL-NEXT: ; def a0
-; GISEL-NEXT: ;;#ASMEND
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: s_lshl_b32 m0, s16, 16
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: ds_gws_init v0 gds
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: gws_init_sgpr_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_lshl_b32 m0, s16, 16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: ds_gws_init a0 gds
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%val = call i32 asm "; def $0", "=a"()
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
ret void
}
define amdgpu_kernel void @gws_init_agpr_offset() #0 {
-; SDAG-LABEL: gws_init_agpr_offset:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: ;;#ASMSTART
-; SDAG-NEXT: ; def a1
-; SDAG-NEXT: ;;#ASMEND
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a1
-; SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SDAG-NEXT: ;;#ASMSTART
-; SDAG-NEXT: ; def a0
-; SDAG-NEXT: ;;#ASMEND
-; SDAG-NEXT: s_lshl_b32 m0, s0, 16
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: ds_gws_init a0 gds
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: gws_init_agpr_offset:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: ;;#ASMSTART
-; GISEL-NEXT: ; def a1
-; GISEL-NEXT: ;;#ASMEND
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a1
-; GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; GISEL-NEXT: ;;#ASMSTART
-; GISEL-NEXT: ; def a0
-; GISEL-NEXT: ;;#ASMEND
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a0
-; GISEL-NEXT: s_lshl_b32 m0, s0, 16
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: ds_gws_init v2 gds
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_endpgm
+; CHECK-LABEL: gws_init_agpr_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a1
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_lshl_b32 m0, s0, 16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: ds_gws_init a0 gds
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_endpgm
%val = call i32 asm "; def $0", "=a"()
%offset = call i32 asm "; def $0", "=a"()
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
@@ -132,40 +76,22 @@ define amdgpu_kernel void @gws_init_agpr_offset() #0 {
}
define void @gws_init_agpr_offset_add1() #0 {
-; SDAG-LABEL: gws_init_agpr_offset_add1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: ;;#ASMSTART
-; SDAG-NEXT: ; def a1
-; SDAG-NEXT: ;;#ASMEND
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a1
-; SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; SDAG-NEXT: ;;#ASMSTART
-; SDAG-NEXT: ; def a0
-; SDAG-NEXT: ;;#ASMEND
-; SDAG-NEXT: s_lshl_b32 m0, s4, 16
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: ds_gws_init a0 offset:1 gds
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: gws_init_agpr_offset_add1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: ;;#ASMSTART
-; GISEL-NEXT: ; def a1
-; GISEL-NEXT: ;;#ASMEND
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a1
-; GISEL-NEXT: v_readfirstlane_b32 s4, v0
-; GISEL-NEXT: ;;#ASMSTART
-; GISEL-NEXT: ; def a0
-; GISEL-NEXT: ;;#ASMEND
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a0
-; GISEL-NEXT: s_lshl_b32 m0, s4, 16
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: ds_gws_init v2 offset:1 gds
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: gws_init_agpr_offset_add1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a1
+; CHECK-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_lshl_b32 m0, s4, 16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: ds_gws_init a0 offset:1 gds
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%val = call i32 asm "; def $0", "=a"()
%offset.base = call i32 asm "; def $0", "=a"()
%offset = add i32 %offset.base, 1
@@ -195,90 +121,51 @@ define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
}
define void @gws_barrier_offset0() #0 {
-; SDAG-LABEL: gws_barrier_offset0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: ;;#ASMSTART
-; SDAG-NEXT: ; def a0
-; SDAG-NEXT: ;;#ASMEND
-; SDAG-NEXT: s_mov_b32 m0, 0
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: ds_gws_barrier a0 gds
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: gws_barrier_offset0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: ;;#ASMSTART
-; GISEL-NEXT: ; def a0
-; GISEL-NEXT: ;;#ASMEND
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: s_mov_b32 m0, 0
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: ds_gws_barrier v0 gds
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: gws_barrier_offset0:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_mov_b32 m0, 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: ds_gws_barrier a0 gds
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%val = call i32 asm "; def $0", "=a"()
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
ret void
}
define void @gws_barrier_offset63() #0 {
-; ...
[truncated]
|
You can test this locally with the following command: git-clang-format --diff origin/main HEAD --extensions h,cpp -- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.h --diff_from_common_commit
View the diff from clang-format here.diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index c575714cf..339501cf7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1964,9 +1964,9 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
return false;
BuildMI(*MBB, &MI, DL, InstrDesc)
- .addReg(VSrc)
- .addImm(ImmOffset)
- .cloneMemRefs(MI);
+ .addReg(VSrc)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
} else {
// Requires even register alignment, so create 64-bit value and pad the
// top half with undef.
@@ -1977,20 +1977,18 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
Register UndefReg = MRI->createVirtualRegister(SubRC);
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
- .addReg(VSrc)
- .addImm(AMDGPU::sub0)
- .addReg(UndefReg)
- .addImm(AMDGPU::sub1);
+ .addReg(VSrc)
+ .addImm(AMDGPU::sub0)
+ .addReg(UndefReg)
+ .addImm(AMDGPU::sub1);
BuildMI(*MBB, &MI, DL, InstrDesc)
- .addReg(DataReg)
- .addImm(ImmOffset)
- .cloneMemRefs(MI);
+ .addReg(DataReg)
+ .addImm(ImmOffset)
+ .cloneMemRefs(MI);
}
} else {
- BuildMI(*MBB, &MI, DL, InstrDesc)
- .addImm(ImmOffset)
- .cloneMemRefs(MI);
+ BuildMI(*MBB, &MI, DL, InstrDesc).addImm(ImmOffset).cloneMemRefs(MI);
}
MI.eraseFromParent();
|
shiltian
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is a nice improvement.
Review comment on the following lines of SIRegisterInfo.td:

      let RenderMethod = "addRegOperands";
    }

    def AV_LdSt_32_Align2_RegOp : RegisterOperand<AV_LdSt_32_Align2> {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if I could do something similar to the pseudo register class that I added for downstream support such that I don't need to do a check on register class?
On targets that require even aligned 64-bit VGPRs, GWS operands require even alignment of a 32-bit operand. Previously we had a hacky post-processing which added an implicit operand to try to manage the constraint. This would require special casing in other passes to avoid breaking the operand constraint. This moves the handling into the instruction definition, so other passes no longer need to consider this edge case. MC still does need to special case this, to print/parse as a 32-bit register. This also still ends up net less work than introducing even aligned 32-bit register classes. This also should be applied to the image special case.
40e1aad to
5c6608c
Compare

On targets that require even aligned 64-bit VGPRs, GWS operands
require even alignment of a 32-bit operand. Previously we had a hacky
post-processing which added an implicit operand to try to manage
the constraint. This would require special casing in other passes
to avoid breaking the operand constraint. This moves the handling
into the instruction definition, so other passes no longer need
to consider this edge case. MC still does need to special case this,
to print/parse as a 32-bit register. This also still ends up net
less work than introducing even aligned 32-bit register classes.
This also should be applied to the image special case.