@@ -2465,71 +2465,94 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
   // these from the dispatch pointer.
 }
 
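+// Attempt to allocate user SGPRs to preload the kernel argument (or argument
+// piece) covering [ArgOffset, ArgOffset + ArgSize). Returns false if the
+// location lies past the end of the current IR argument or if too few free
+// user SGPRs remain; returns true once preload registers are assigned.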
+static bool allocPreloadKernArg(uint64_t &LastExplicitArgOffset,
+                                uint64_t ExplicitArgOffset, uint64_t ArgOffset,
+                                unsigned ArgSize, unsigned Idx,
+                                MachineFunction &MF, const SIRegisterInfo &TRI,
+                                SIMachineFunctionInfo &Info, CCState &CCInfo) {
+  if (ArgOffset >= ExplicitArgOffset)
+    return false;
+
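+  // The kernarg segment base is assumed to be 16-byte aligned, so an
+  // argument's alignment within the segment follows from its offset alone.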
+  const Align KernelArgBaseAlign = Align(16);
+  Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+  unsigned NumAllocSGPRs = alignTo(ArgSize, 4) / 4;
+
+  // Arg is preloaded into the previous SGPR.
+  if (ArgSize < 4 && Alignment < 4) {
+    Info.getArgInfo().PreloadKernArgs[Idx].Regs.push_back(
+        Info.getArgInfo().PreloadKernArgs[Idx - 1].Regs[0]);
+    return true;
+  }
+
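+  // Alignment padding between this argument and the previous one also
+  // occupies user SGPRs, so count those dwords against the free-SGPR budget.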
+  unsigned Padding = ArgOffset - LastExplicitArgOffset;
+  unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+  // Check for free user SGPRs for preloading.
+  if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
+      Info.getUserSGPRInfo().getNumFreeUserSGPRs()) {
+    return false;
+  }
+
+  // Preload this argument.
+  const TargetRegisterClass *RC =
+      TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+  SmallVectorImpl<MCRegister> *PreloadRegs =
+      Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, Idx, PaddingSGPRs);
+
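+  // A value split over several registers is preloaded into individual 32-bit
+  // SGPRs rather than a wide SGPR tuple.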
+  if (PreloadRegs->size() > 1)
+    RC = &AMDGPU::SGPR_32RegClass;
+
+  for (auto &Reg : *PreloadRegs) {
+    assert(Reg);
+    MF.addLiveIn(Reg, RC);
+    CCInfo.AllocateReg(Reg);
+  }
+
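+  // Record where the preloaded data ends so the padding in front of the next
+  // argument can be computed.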
+  LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+  return true;
+}
+
 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
 // sequential starting from the first argument.
 void SITargetLowering::allocatePreloadKernArgSGPRs(
-    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
-    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, MachineFunction &MF,
     const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
   Function &F = MF.getFunction();
-  unsigned LastExplicitArgOffset =
-      MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
-  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
-  bool InPreloadSequence = true;
-  unsigned InIdx = 0;
+  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
+  uint64_t ExplicitArgOffset = BaseOffset;
+  uint64_t LastExplicitArgOffset = ExplicitArgOffset;
+  unsigned LocIdx = 0;
   for (auto &Arg : F.args()) {
-    if (!InPreloadSequence || !Arg.hasInRegAttr())
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    const bool IsByRef = Arg.hasByRefAttr();
+    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+    if (AllocSize == 0)
       break;
 
-    int ArgIdx = Arg.getArgNo();
-    // Don't preload non-original args or parts not in the current preload
-    // sequence.
-    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
-                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
+    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
+    Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
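+    // ArgOffset is where this argument starts in the kernarg segment;
+    // ExplicitArgOffset advances to the first byte past it.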
+    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
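+    // Only arguments marked inreg are preloaded; the first argument without
+    // the attribute ends the preload sequence.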
+    if (!Arg.hasInRegAttr())
       break;
 
-    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
-           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
-         InIdx++) {
-      assert(ArgLocs[ArgIdx].isMemLoc());
-      auto &ArgLoc = ArgLocs[InIdx];
-      const Align KernelArgBaseAlign = Align(16);
-      unsigned ArgOffset = ArgLoc.getLocMemOffset();
-      Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
-      unsigned NumAllocSGPRs =
-          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
-
-      // Arg is preloaded into the previous SGPR.
-      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
-        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
-            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
-        continue;
-      }
-
-      unsigned Padding = ArgOffset - LastExplicitArgOffset;
-      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
-      // Check for free user SGPRs for preloading.
-      if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
-          SGPRInfo.getNumFreeUserSGPRs()) {
-        InPreloadSequence = false;
-        break;
-      }
-
-      // Preload this argument.
-      const TargetRegisterClass *RC =
-          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
-      SmallVectorImpl<MCRegister> *PreloadRegs =
-          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
-
-      if (PreloadRegs->size() > 1)
-        RC = &AMDGPU::SGPR_32RegClass;
-      for (auto &Reg : *PreloadRegs) {
-        assert(Reg);
-        MF.addLiveIn(Reg, RC);
-        CCInfo.AllocateReg(Reg);
+    if (ArgLocs.empty()) {
+      // GlobalISel: no argument locations were built, so allocate directly
+      // from the IR argument's offset and size.
+      allocPreloadKernArg(LastExplicitArgOffset, ExplicitArgOffset,
+                          ArgOffset, AllocSize, Arg.getArgNo(), MF,
+                          TRI, Info, CCInfo);
+    } else {
+      // SelectionDAG: walk the CCValAssign locations, which may split one IR
+      // argument into several parts.
+      for (; LocIdx < ArgLocs.size(); LocIdx++) {
+        CCValAssign &ArgLoc = ArgLocs[LocIdx];
+        assert(ArgLoc.isMemLoc());
+        uint64_t LocOffset = ArgLoc.getLocMemOffset();
+        unsigned LocSize = ArgLoc.getLocVT().getStoreSize();
+        if (!allocPreloadKernArg(LastExplicitArgOffset, ExplicitArgOffset,
+                                 LocOffset, LocSize, LocIdx, MF, TRI, Info,
+                                 CCInfo))
+          break;
       }
-
-    LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+    }
   }
 }
@@ -2854,7 +2877,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
     if (IsKernel && Subtarget->hasKernargPreload())
-      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
 
     allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
   } else if (!IsGraphics) {