@@ -4970,11 +4970,21 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49704970 const SIRegisterInfo *TRI = ST.getRegisterInfo();
49714971 const DebugLoc &DL = MI.getDebugLoc();
49724972 const SIInstrInfo *TII = ST.getInstrInfo();
4973-
4973+ const MachineFunction *MF = BB.getParent();
4974+ const TargetRegisterInfo *TrgtRegInfo = MF->getSubtarget().getRegisterInfo();
49744975 // Reduction operations depend on whether the input operand is SGPR or VGPR.
49754976 Register SrcReg = MI.getOperand(1).getReg();
4976- bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4977+ auto SrcRegClass = MRI.getRegClass(SrcReg);
4978+ llvm::errs() << TrgtRegInfo->getRegClassName(SrcRegClass) << "\n";
4979+ bool isSGPR = TRI->isSGPRClass(SrcRegClass);
49774980 Register DstReg = MI.getOperand(0).getReg();
4981+ llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DstReg)) << "\n";
4982+ Register MaskReg = MI.getOperand(2).getReg();
4983+ llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
4984+
4985+ // llvm::errs() << "srcreg:" << MRI.getRegClassName(MRI.getRegClass(SrcReg)) << "\n";
4986+ // llvm::errs() << "DstReg:" << MRI.getRegClassName(MRI.getRegClass(DstReg)) << "\n";
4987+ // llvm::errs() << "MaskReg:" << MRI.getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
49784988 MachineBasicBlock *RetBB = nullptr;
49794989 if (isSGPR) {
49804990 // These operations with a uniform value i.e. SGPR are idempotent.
@@ -5005,15 +5015,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50055015 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
50065016 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
50075017 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5008- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5018+ Register InitalValReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
50095019
5010- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5020+ Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
50115021 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
50125022 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5023+ Register TempRegMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
50135024
50145025 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5026+ Register FF1MaskReg = MRI.createVirtualRegister(DstRegClass);
50155027 Register LaneValueReg =
50165028 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5029+ Register MaskLaneValueReg =
5030+ MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
50175031
50185032 bool IsWave32 = ST.isWave32();
50195033 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -5024,9 +5038,11 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50245038 uint32_t InitalValue =
50255039 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
50265040 auto TmpSReg =
5027- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5041+ BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); //s_mov_b64 s[2:3], exec
5042+ // auto TmpMaskSReg =
5043+ // BuildMI(BB, I, DL, TII->get(MovOpc), TempRegMaskReg).addReg(MaskReg); //s_mov_b64 s[2:3], exec
50285044 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5029- .addImm(InitalValue);
5045+ .addImm(InitalValue);//s_mov_b32 s4, 0 | %17:sgpr_32 = S_MOV_B32 0
50305046 // clang-format off
50315047 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
50325048 .addMBB(ComputeLoop);
@@ -5046,22 +5062,28 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50465062 // Perform the computations
50475063 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
50485064 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5049- .addReg(ActiveBits->getOperand(0).getReg());
5065+ .addReg(ActiveBits->getOperand(0).getReg());//%index.sgpr = S_FF1_I32_B64 %exec_copy.sreg
50505066 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
50515067 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
50525068 .addReg(SrcReg)
5053- .addReg(FF1->getOperand(0).getReg());
5069+ .addReg(FF1->getOperand(0).getReg());//%value_at_lane_index.sreg = V_READLANE %value.vgpr %index.sgpr
5070+ auto MaskLaneValue = BuildMI(*ComputeLoop, I, DL,
5071+ TII->get(AMDGPU::V_READLANE_B32), MaskLaneValueReg)
5072+ .addReg(MaskReg)
5073+ .addReg(FF1->getOperand(0).getReg());//%mask_at_lane_index.sreg = V_READLANE %mask.vgpr %index.sgpr
5074+ auto FF2 = BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B64), FF1Reg)
5075+ .addReg(MaskLaneValue->getOperand(0).getReg());//%subgroupindex.sgpr = S_FF1_I32_B64 %mask_at_lane_index.sreg
50545076 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
50555077 .addReg(Accumulator->getOperand(0).getReg())
5056- .addReg(LaneValue->getOperand(0).getReg());
5078+ .addReg(LaneValue->getOperand(0).getReg());//%acc.sgpr = max %acc.sgpr %value_at_lane_index.sreg
50575079
50585080 // Manipulate the iterator to get the next active lane
50595081 unsigned BITSETOpc =
50605082 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
50615083 auto NewActiveBits =
50625084 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
50635085 .addReg(FF1->getOperand(0).getReg())
5064- .addReg(ActiveBits->getOperand(0).getReg());
5086+ .addReg(ActiveBits->getOperand(0).getReg());//%bitsetresult = S_BITSET0_B64 %exec_copy
50655087
50665088 // Add phi nodes
50675089 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
0 commit comments