
Commit a6d2d4c

Merge amd-gfx13 and amd-gfx into amd-gfx-gfx13
3 parents: 5853c73 + 1cf5cb3 + b7714b7

13 files changed: +303 / -110 lines

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 18 additions & 2 deletions
@@ -2143,6 +2143,21 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoScaleOffsetM0(
+    SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset,
+    SDValue &CPol) const {
+  bool ScaleOffset;
+  if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset, true,
+                         false))
+    return false;
+
+  // We are assuming CPol is second from last operand of the intrinsic.
+  auto PassedCPol =
+      N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
+  CPol = CurDAG->getTargetConstant(PassedCPol, SDLoc(), MVT::i32);
+  return true;
+}
+
 bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetScaleOffsetM0(
     SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset,
     SDValue &CPol) const {
@@ -3182,8 +3197,9 @@ void AMDGPUDAGToDAGISel::SelectLOAD_MCAST(MemIntrinsicSDNode *N,
   }
   case AMDGPUAS::DISTRIBUTED: {
     // Choose best addressing mode
-    if (SelectGlobalSAddrCPolM0(N, N->getOperand(3) /*Addr*/, V0 /*SAddr*/,
-                                V1 /*VOffset*/, V2 /*Offset*/, V3 /*CPol*/)) {
+    if (SelectGlobalSAddrNoScaleOffsetM0(N, N->getOperand(3) /*Addr*/,
+                                         V0 /*SAddr*/, V1 /*VOffset*/,
+                                         V2 /*Offset*/, V3 /*CPol*/)) {
       MCastOps.push_back(V0);
       MCastOps.push_back(V1);
       MCastOps.push_back(V2);
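
The new selector keeps the intrinsic's cache-policy operand but clears the scale-offset bit before re-emitting it. A minimal standalone model of that plumbing (plain C++, not the SelectionDAG code; the SCAL bit position and the operand layout below are placeholder assumptions):

#include <cassert>
#include <cstdint>
#include <vector>

// Placeholder bit position; the real value is AMDGPU::CPol::SCAL in the backend.
constexpr uint32_t SCAL_BIT = 1u << 11;

// CPol is assumed to be the second-from-last operand of the intrinsic node,
// mirroring the comment in SelectGlobalSAddrNoScaleOffsetM0.
static uint32_t passedCPol(const std::vector<uint32_t> &Ops) {
  return Ops[Ops.size() - 2] & ~SCAL_BIT;
}

int main() {
  // Hypothetical operand list: ..., cpol with SCAL set, trailing operand.
  std::vector<uint32_t> Ops = {0, 0, 0, SCAL_BIT | 0x3u, 0};
  assert(passedCPol(Ops) == 0x3u); // SCAL cleared, other policy bits kept
  return 0;
}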

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 3 additions & 0 deletions
@@ -182,6 +182,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
                                     SDValue &VOffset, SDValue &CPol) const;
   bool SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr, SDValue &SAddr,
                                     SDValue &VOffset, SDValue &CPol) const;
+  bool SelectGlobalSAddrNoScaleOffsetM0(SDNode *N, SDValue Addr, SDValue &SAddr,
+                                        SDValue &VOffset, SDValue &Offset,
+                                        SDValue &CPol) const;
   bool SelectGlobalSAddrNoScaleOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
                                       SDValue &VOffset, SDValue &Offset,
                                       SDValue &CPol) const;

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 69 additions & 27 deletions
@@ -953,43 +953,65 @@ class AMDGPULowerModuleLDS {
     return NewGV;
   }
 
+  /// Assigns an absolute address for special kinds of GVs like semaphores and
+  /// barriers. Does this in two rounds: first by assigning a module-absolute
+  /// address for any GV that is indirectly used by more than one kernel, and
+  /// second by computing a kernel relative assignment for any GVs remaining.
   bool lowerSpecialLDSVariables(
       Module &M, LDSUsesInfoTy &LDSUsesInfo,
      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
     bool Changed = false;
-    constexpr unsigned NumScopes =
-        static_cast<unsigned>(Barrier::Scope::NUM_SCOPES);
     const DataLayout &DL = M.getDataLayout();
+
+    unsigned NumSemAbsolutes[MAX_WAVES_PER_WAVEGROUP] = {0};
+    constexpr unsigned NumBarScopes =
+        static_cast<unsigned>(Barrier::Scope::NUM_SCOPES);
+    unsigned NumBarAbsolutes[NumBarScopes] = {0};
+
     // The 1st round: give module-absolute assignments
-    unsigned NumAbsolutes[NumScopes] = {0};
     std::vector<GlobalVariable *> OrderedGVs;
     for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
       GlobalVariable *GV = K.first;
-      if (!isNamedBarrier(*GV))
+      if (!(isNamedBarrier(*GV) || isLDSSemaphore(*GV)))
         continue;
-      // give a module-absolute assignment if it is indirectly accessed by
+
+      // Give a module-absolute assignment if it is indirectly accessed by
       // multiple kernels. This is not precise, but we don't want to duplicate
       // a function when it is called by multiple kernels.
       if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
         OrderedGVs.push_back(GV);
       } else {
-        // leave it to the 2nd round, which will give a kernel-relative
-        // assignment if it is only indirectly accessed by one kernel
+        // Leave it to the 2nd round, which will give a kernel-relative
+        // assignment if it is only indirectly accessed by one kernel.
         LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
       }
       LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
     }
     OrderedGVs = sortByName(std::move(OrderedGVs));
     for (GlobalVariable *GV : OrderedGVs) {
-      TargetExtType *ExtTy = isNamedBarrier(*GV);
-      unsigned BarrierScope = ExtTy->getIntParameter(0);
-      unsigned BarId = NumAbsolutes[BarrierScope] + 1;
-      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-      NumAbsolutes[BarrierScope] += BarCnt;
-
-      // 4 bits for alignment, 5 bits for the barrier num,
-      // 3 bits for the barrier scope
-      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+      unsigned Offset;
+      if (TargetExtType *ExtTy = isNamedBarrier(*GV)) {
+        unsigned BarrierScope = ExtTy->getIntParameter(0);
+        unsigned BarId = NumBarAbsolutes[BarrierScope] + 1;
+        unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+        NumBarAbsolutes[BarrierScope] += BarCnt;
+
+        // 4 bits for alignment, 5 bits for the barrier num,
+        // 3 bits for the barrier scope
+        Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+
+      } else if (TargetExtType *ExtTy = isLDSSemaphore(*GV)) {
+        unsigned OwningRank = ExtTy->getIntParameter(0);
+        assert(OwningRank < MAX_WAVES_PER_WAVEGROUP);
+        unsigned Num = ++NumSemAbsolutes[OwningRank];
+
+        // 4 bits for alignment, 4 bits for the semaphore num,
+        // 4 bits for the owning rank
+        Offset = 0x801000u | OwningRank << 8 | Num << 4;
+
+      } else
+        llvm_unreachable("Unhandled special variable type.");
+
       recordLDSAbsoluteAddress(&M, GV, Offset);
     }
     OrderedGVs.clear();
@@ -1005,32 +1027,52 @@
     }
     OrderedKernels = sortByName(std::move(OrderedKernels));
 
-    DenseMap<Function *, unsigned> Kernel2BarId[NumScopes];
+    DenseMap<Function *, unsigned> Kernel2BarId[NumBarScopes];
+    DenseMap<Function *, unsigned> Kernel2SemRelative[MAX_WAVES_PER_WAVEGROUP];
     for (Function *F : OrderedKernels) {
+
+      // Collect all globals for each kernel.
       for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-        if (!isNamedBarrier(*GV))
+        if (!(isNamedBarrier(*GV) || isLDSSemaphore(*GV)))
           continue;
 
         LDSUsesInfo.direct_access[F].erase(GV);
         if (GV->isAbsoluteSymbolRef()) {
-          // already assigned
+          // Already assigned.
          continue;
         }
         OrderedGVs.push_back(GV);
       }
+
       OrderedGVs = sortByName(std::move(OrderedGVs));
       for (GlobalVariable *GV : OrderedGVs) {
         // GV could also be used directly by other kernels. If so, we need to
         // create a new GV used only by this kernel and its function.
         auto NewGV = uniquifyGVPerKernel(M, GV, F);
         Changed |= (NewGV != GV);
-        TargetExtType *ExtTy = isNamedBarrier(*GV);
-        unsigned BarrierScope = ExtTy->getIntParameter(0);
-        unsigned BarId = Kernel2BarId[BarrierScope][F];
-        BarId += NumAbsolutes[BarrierScope] + 1;
-        unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-        Kernel2BarId[BarrierScope][F] += BarCnt;
-        unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+        unsigned Offset;
+        if (TargetExtType *ExtTy = isNamedBarrier(*GV)) {
+          // Place each barrier in the next open slot above the module-relative
+          // and already assigned kernel-relative barriers.
+          unsigned BarrierScope = ExtTy->getIntParameter(0);
+          unsigned BarId = Kernel2BarId[BarrierScope][F];
+          BarId += NumBarAbsolutes[BarrierScope] + 1;
+          unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+          Kernel2BarId[BarrierScope][F] += BarCnt;
+          Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+
+        } else if (TargetExtType *ExtTy = isLDSSemaphore(*GV)) {
+          // Determine which semaphore GVs were already assigned, and for the
+          // remaining ones assign the semaphore nums above.
+          unsigned OwningRank =
+              ExtTy->getIntParameter(0) % MAX_WAVES_PER_WAVEGROUP;
+          unsigned Num = NumSemAbsolutes[OwningRank];
+          Kernel2SemRelative[OwningRank][F]++;
+          Num += Kernel2SemRelative[OwningRank][F];
+          Offset = 0x801000u | OwningRank << 8 | Num << 4;
+
+        } else
+          llvm_unreachable("Unhandled special variable type.");
         recordLDSAbsoluteAddress(&M, NewGV, Offset);
       }
       OrderedGVs.clear();
@@ -1039,7 +1081,7 @@
     for (auto &K : LDSUsesInfo.indirect_access) {
       assert(isKernelLDS(K.first));
       for (GlobalVariable *GV : K.second) {
-        if (isNamedBarrier(*GV))
+        if (isNamedBarrier(*GV) || isLDSSemaphore(*GV))
           K.second.erase(GV);
       }
     }
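
The two absolute-address layouts assigned above (0x802000u | scope << 9 | id << 4 for named barriers, 0x801000u | rank << 8 | num << 4 for LDS semaphores) can be sanity-checked with a short standalone sketch; the scope, rank and id values below are made-up examples, not taken from this commit:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Barrier layout: base 0x802000, 4 alignment bits, BarId in bits [8:4],
// BarrierScope in bits [11:9].
static uint32_t encodeBarrier(uint32_t Scope, uint32_t BarId) {
  return 0x802000u | Scope << 9 | BarId << 4;
}

// Semaphore layout: base 0x801000, 4 alignment bits, Num in bits [7:4],
// OwningRank in bits [11:8].
static uint32_t encodeSemaphore(uint32_t OwningRank, uint32_t Num) {
  return 0x801000u | OwningRank << 8 | Num << 4;
}

int main() {
  // Example values only: first barrier (id 1) in scope 1, and the first
  // semaphore (num 1) owned by rank 2.
  assert(encodeBarrier(1, 1) == 0x802210u);
  assert(encodeSemaphore(2, 1) == 0x801210u);
  printf("barrier   0x%x\nsemaphore 0x%x\n", encodeBarrier(1, 1),
         encodeSemaphore(2, 1));
  return 0;
}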

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

Lines changed: 7 additions & 5 deletions
@@ -116,13 +116,15 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
   }
 
   if (TargetExtType *TTy = AMDGPU::isLDSSemaphore(GV)) {
-    unsigned OwningRank = TTy->getIntParameter(0) % MAX_WAVES_PER_WAVEGROUP;
-    unsigned Num = ++NumSemaphores[OwningRank];
-    Offset = 0x801000u | OwningRank << 8 | Num << 4;
     // TODO-GFX13: Diagnose trying to allocate more than the 5 semaphores
     // supported by hardware.
-    Entry.first->second = Offset;
-    return Offset;
+    std::optional<unsigned> SemAddr =
+        getAbsoluteAddress(GV, AMDGPUAS::LOCAL_ADDRESS);
+    if (!SemAddr)
+      llvm_unreachable("Semaphore should have an assigned address");
+    Entry.first->second = SemAddr.value();
+    recordNumSemaphores(SemAddr.value());
+    return SemAddr.value();
   }
 
   std::optional<uint32_t> MaybeAbs =

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 6 additions & 0 deletions
@@ -114,6 +114,12 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     LaneSharedVGPRSize = std::max(LaneSharedVGPRSize, VGPRSize);
   }
 
+  void recordNumSemaphores(uint32_t GVAddr) {
+    unsigned OwningRank = ((GVAddr & 0xfff) >> 8);
+    unsigned NumSems = ((GVAddr & 0xff) >> 4);
+    NumSemaphores[OwningRank] = std::max(NumSemaphores[OwningRank], NumSems);
+  }
+
   void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) {
     NumNamedBarriers =
         std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
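
As a quick cross-check that this field extraction inverts the encodings assigned in AMDGPULowerModuleLDS, a standalone sketch using assumed example addresses (not values from the commit):

#include <cassert>
#include <cstdint>

int main() {
  // Semaphore address encoded as 0x801000 | rank << 8 | num << 4
  // (rank 2, num 1 here; example values only).
  uint32_t SemAddr = 0x801210u;
  assert(((SemAddr & 0xfff) >> 8) == 2); // OwningRank
  assert(((SemAddr & 0xff) >> 4) == 1);  // per-rank semaphore count

  // Barrier address encoded as 0x802000 | scope << 9 | id << 4
  // (scope 1, id 1 here).
  uint32_t BarAddr = 0x802210u;
  assert(((BarAddr & 0x1ff) >> 4) == 1); // BarId term used by recordNumNamedBarriers
  return 0;
}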

llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp

Lines changed: 1 addition & 3 deletions
@@ -87,8 +87,6 @@ bool isLDSVariableToLower(const GlobalVariable &GV) {
   if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
     return false;
   }
-  if (isLDSSemaphore(GV))
-    return false;
   if (isDynamicLDS(GV)) {
     return true;
   }
@@ -295,7 +293,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
          AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
      if (IsDirectMapDynLDSGV)
        continue;
-     if (isNamedBarrier(*GV)) {
+     if (isNamedBarrier(*GV) || isLDSSemaphore(*GV)) {
        HasSpecialGVs = true;
        continue;
      }

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 20 additions & 13 deletions
@@ -7373,6 +7373,17 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
     return CreatedBB;
   }
 
+  bool isSoffsetLegal = true;
+  int SoffsetIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
+  if (SoffsetIdx != -1) {
+    MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
+    if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
+        !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
+      isSoffsetLegal = false;
+    }
+  }
+
   // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
   //
   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
@@ -7384,8 +7395,15 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
                                     ? AMDGPU::OpName::rsrc
                                     : AMDGPU::OpName::srsrc;
     MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
-    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
-      CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
+    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
+      if (isSoffsetLegal) {
+        CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
+      } else {
+        MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+        CreatedBB =
+            loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc, Soffset}, MDT);
+      }
+    }
 
     AMDGPU::OpName SampOpName =
         isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
@@ -7593,17 +7611,6 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
   }
 
   // Legalize MUBUF instructions.
-  bool isSoffsetLegal = true;
-  int SoffsetIdx =
-      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
-  if (SoffsetIdx != -1) {
-    MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
-    if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
-        !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
-      isSoffsetLegal = false;
-    }
-  }
-
   bool isRsrcLegal = true;
   int RsrcIdx =
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
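
A rough standalone model of the decision added above, using stand-in types rather than the MachineInstr/SIRegisterInfo API: when soffset was already found to be a non-SGPR virtual register, it rides along in the same waterfall load as srsrc instead of being legalized separately.

#include <cassert>
#include <vector>

struct Op {
  const char *Name;
  bool IsSGPR; // true when the operand already lives in an SGPR
};

// Collect the scalar operands that need to be pulled out of VGPRs together.
static std::vector<Op *> scalarOpsToLoad(Op &SRsrc, Op &Soffset,
                                         bool IsSoffsetLegal) {
  std::vector<Op *> Ops;
  if (!SRsrc.IsSGPR) {
    Ops.push_back(&SRsrc);
    if (!IsSoffsetLegal)
      Ops.push_back(&Soffset);
  }
  return Ops;
}

int main() {
  Op SRsrc{"srsrc", false}, Soffset{"soffset", false};
  // Both operands are in VGPRs: one combined waterfall loop handles them.
  assert(scalarOpsToLoad(SRsrc, Soffset, /*IsSoffsetLegal=*/false).size() == 2);
  return 0;
}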

llvm/test/CodeGen/AMDGPU/legalize-operands-store-idx.ll

Lines changed: 9 additions & 15 deletions
@@ -75,25 +75,19 @@ main_body:
   ret void
 }
 
-define void @vnbr(ptr addrspace(10) %itp, ptr addrspace(10) %itp_refl) {
+define amdgpu_kernel void @vnbr(ptr addrspace(10) %itp, ptr addrspace(10) %itp_refl) {
 ; GFX13-LABEL: vnbr:
 ; GFX13: ; %bb.0: ; %main_body
-; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX13-NEXT: s_wait_expcnt 0x0
-; GFX13-NEXT: s_wait_samplecnt 0x0
-; GFX13-NEXT: s_wait_rtscnt 0x0
-; GFX13-NEXT: s_wait_kmcnt 0x0
-; GFX13-NEXT: v_dual_lshrrev_b32 v1, 2, v1 :: v_dual_lshrrev_b32 v0, 2, v0
-; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX13-NEXT: v_readfirstlane_b32 s0, v1
-; GFX13-NEXT: v_readfirstlane_b32 s1, v0
+; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX13-NEXT: v_mov_b32_e32 v0, 0
-; GFX13-NEXT: s_set_gpr_idx_u32 idx2, s0
-; GFX13-NEXT: s_set_gpr_idx_u32 idx1, s1
+; GFX13-NEXT: s_wait_kmcnt 0x0
+; GFX13-NEXT: s_lshr_b32 s1, s1, 2
+; GFX13-NEXT: s_lshr_b32 s0, s0, 2
+; GFX13-NEXT: s_set_gpr_idx_u32 idx2, s1
+; GFX13-NEXT: s_set_gpr_idx_u32 idx1, s0
 ; GFX13-NEXT: s_set_vgpr_frames 0x48 ; vsrc0_idx=0 vsrc1_idx=2 vsrc2_idx=0 vdst_idx=1 vsrc0_msb=0 vsrc1_msb=0 vsrc2_msb=0 vdst_msb=0
-; GFX13-NEXT: v_send_vgpr_next_b32 g1[0], g2[0], v0 sema_id:2 sema_wave_id:1 sema_id_refl:1 sema_wave_id_refl:1 wait_va_vdst:0
-; GFX13-NEXT: s_set_vgpr_frames 0 ; vsrc0_idx=0 vsrc1_idx=0 vsrc2_idx=0 vdst_idx=0 vsrc0_msb=0 vsrc1_msb=0 vsrc2_msb=0 vdst_msb=0
-; GFX13-NEXT: s_set_pc_i64 s[30:31]
+; GFX13-NEXT: v_send_vgpr_next_b32 g1[0], g2[0], v0 sema_id:1 sema_wave_id:1 sema_id_refl:2 sema_wave_id_refl:1 wait_va_vdst:0
+; GFX13-NEXT: s_endpgm
 main_body:
   call void @llvm.amdgcn.spatial.cluster.send.next(i32 0, ptr addrspace(10) %itp, ptr addrspace(3) @sem,
       ptr addrspace(10) %itp_refl, ptr addrspace(3) @sem2, i32 0);

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.mcast.ll

Lines changed: 4 additions & 2 deletions
@@ -943,10 +943,11 @@ define void @load_mcast_monitor_b32_saddr_scale_offset_distributed(ptr addrspace
 ; CHECK-NEXT: s_wait_kmcnt 0x0
 ; CHECK-NEXT: s_mov_b32 s2, s33
 ; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v2
 ; CHECK-NEXT: s_mov_b32 m0, s1
 ; CHECK-NEXT: s_set_gpr_idx_u32 idx1, 0
 ; CHECK-NEXT: s_set_vgpr_frames 0x44 ; vsrc0_idx=0 vsrc1_idx=1 vsrc2_idx=0 vdst_idx=1 vsrc0_msb=0 vsrc1_msb=0 vsrc2_msb=0 vdst_msb=0
-; CHECK-NEXT: dds_load_mcast_b32 g1[0], v2, s0 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; CHECK-NEXT: dds_load_mcast_b32 g1[0], v2, s0 th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt 0x0
 ; CHECK-NEXT: global_store_b32 v[0:1], g1[0], off
 ; CHECK-NEXT: s_mov_b32 s33, s2
@@ -1054,10 +1055,11 @@ define void @load_mcast_monitor_b64_saddr_scale_offset_distributed(ptr addrspace
 ; CHECK-NEXT: s_wait_kmcnt 0x0
 ; CHECK-NEXT: s_mov_b32 s2, s33
 ; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v2
 ; CHECK-NEXT: s_mov_b32 m0, s1
 ; CHECK-NEXT: s_set_gpr_idx_u32 idx1, 0
 ; CHECK-NEXT: s_set_vgpr_frames 0x44 ; vsrc0_idx=0 vsrc1_idx=1 vsrc2_idx=0 vdst_idx=1 vsrc0_msb=0 vsrc1_msb=0 vsrc2_msb=0 vdst_msb=0
-; CHECK-NEXT: dds_load_mcast_b64 g1[0:1], v2, s0 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; CHECK-NEXT: dds_load_mcast_b64 g1[0:1], v2, s0 th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt 0x0
 ; CHECK-NEXT: global_store_b64 v[0:1], g1[0:1], off
 ; CHECK-NEXT: s_mov_b32 s33, s2
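
Since the DISTRIBUTED multicast path no longer selects scale_offset, the updated checks scale the index with an explicit VALU shift instead. A trivial standalone sketch of that arithmetic (example index value only):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Index = 5;
  assert((Index << 2) == 20); // b32 element: matches v_lshlrev_b32_e32 v2, 2, v2
  assert((Index << 3) == 40); // b64 element: matches v_lshlrev_b32_e32 v2, 3, v2
  return 0;
}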
