Skip to content

Commit d3c3c6b

Browse files
authored
AMDGPU: Fix treating divergent loads as uniform (llvm#168785)
Avoids regression which caused the revert 6d5f87f. This is a hack on a hack. We currently have isUniformMMO, which improperly treats unknown source value as known uniform. This is hack from before we had divergence information in the DAG, and should be removed. This is the minimum change to avoid the regression; removing the aggressive handling of the unknown case (or dropping isUniformMMO entirely) are more involved fixes.
1 parent 53b2697 commit d3c3c6b

File tree

3 files changed

+96
-3
lines changed

3 files changed

+96
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4420,10 +4420,18 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
44204420

44214421
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
44224422
const auto *Ld = cast<LoadSDNode>(N);
4423-
44244423
const MachineMemOperand *MMO = Ld->getMemOperand();
4425-
if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4426-
return false;
4424+
4425+
if (Ld->isDivergent()) {
4426+
// FIXME: We ought to able able to take the direct isDivergent result. We
4427+
// cannot rely on the MMO for a uniformity check, and should stop using
4428+
// it. This is a hack for 2 ways that the IR divergence analysis is superior
4429+
// to the DAG divergence: Recognizing shift-of-workitem-id as always
4430+
// uniform, and isSingleLaneExecution. These should be handled in the DAG
4431+
// version, and then this can be dropped.
4432+
if (!MMO->getValue() || !AMDGPU::isUniformMMO(MMO))
4433+
return false;
4434+
}
44274435

44284436
return MMO->getSize().hasValue() &&
44294437
Ld->getAlign() >=

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) {
2828

2929
// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
3030
bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) {
31+
// FIXME: null value is should be treated as unknown, not as uniform.
3132
const Value *Ptr = MMO->getValue();
3233
// UndefValue means this is a load of a kernel input. These are uniform.
3334
// Sometimes LDS instructions have constant pointers.

llvm/test/CodeGen/AMDGPU/load-select-ptr.ll

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,3 +139,87 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr
139139
store i64 %tmp5, ptr addrspace(1) %ptr2, align 8
140140
ret void
141141
}
142+
143+
; The resultant load cannot be treated as uniform
144+
define amdgpu_kernel void @sample_test(ptr addrspace(1) %dest, ptr addrspace(1) %sourceA, ptr addrspace(1) %sourceB, i1 %tobool.not.i) #0 {
145+
; GCN-LABEL: sample_test:
146+
; GCN: ; %bb.0: ; %entry
147+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
148+
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
149+
; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0
150+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
151+
; GCN-NEXT: v_mov_b32_e32 v1, s3
152+
; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v2
153+
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
154+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
155+
; GCN-NEXT: s_load_dword s2, s[4:5], 0x18
156+
; GCN-NEXT: v_mov_b32_e32 v3, s1
157+
; GCN-NEXT: v_add_u32_e32 v2, vcc, s0, v2
158+
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
159+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
160+
; GCN-NEXT: s_bitcmp1_b32 s2, 0
161+
; GCN-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
162+
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
163+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
164+
; GCN-NEXT: v_mov_b32_e32 v4, s3
165+
; GCN-NEXT: v_mov_b32_e32 v5, s2
166+
; GCN-NEXT: s_waitcnt vmcnt(0)
167+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
168+
; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
169+
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
170+
; GCN-NEXT: s_endpgm
171+
entry:
172+
%0 = tail call i32 @llvm.amdgcn.workitem.id.x()
173+
%conv2.i.i.i1 = zext i32 %0 to i64
174+
%arrayidx.i = getelementptr i64, ptr addrspace(1) %sourceA, i64 %conv2.i.i.i1
175+
%dest.gep = getelementptr i64, ptr addrspace(1) %dest, i64 %conv2.i.i.i1
176+
%ld0 = load i64, ptr addrspace(1) %arrayidx.i, align 8, !amdgpu.noclobber !0
177+
%ld1 = load i64, ptr addrspace(1) %sourceB, align 8
178+
%cond.i = select i1 %tobool.not.i, i64 %ld0, i64 %ld1
179+
store i64 %cond.i, ptr addrspace(1) %dest.gep, align 8
180+
ret void
181+
}
182+
183+
; The resultant load cannot be treated as uniform
184+
define amdgpu_kernel void @constant_is_not_uniform(ptr addrspace(1) %dest, ptr addrspace(4) %sourceA, ptr addrspace(4) %sourceB, i1 %tobool.not.i) #0 {
185+
; GCN-LABEL: constant_is_not_uniform:
186+
; GCN: ; %bb.0: ; %entry
187+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
188+
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
189+
; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0
190+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
191+
; GCN-NEXT: v_mov_b32_e32 v1, s3
192+
; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v2
193+
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
194+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
195+
; GCN-NEXT: s_load_dword s2, s[4:5], 0x18
196+
; GCN-NEXT: v_mov_b32_e32 v3, s1
197+
; GCN-NEXT: v_add_u32_e32 v2, vcc, s0, v2
198+
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
199+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
200+
; GCN-NEXT: s_bitcmp1_b32 s2, 0
201+
; GCN-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
202+
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
203+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
204+
; GCN-NEXT: v_mov_b32_e32 v4, s3
205+
; GCN-NEXT: v_mov_b32_e32 v5, s2
206+
; GCN-NEXT: s_waitcnt vmcnt(0)
207+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
208+
; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
209+
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
210+
; GCN-NEXT: s_endpgm
211+
entry:
212+
%0 = tail call i32 @llvm.amdgcn.workitem.id.x()
213+
%conv2.i.i.i1 = zext i32 %0 to i64
214+
%arrayidx.i = getelementptr i64, ptr addrspace(4) %sourceA, i64 %conv2.i.i.i1
215+
%dest.gep = getelementptr i64, ptr addrspace(1) %dest, i64 %conv2.i.i.i1
216+
%ld0 = load i64, ptr addrspace(4) %arrayidx.i, align 8
217+
%ld1 = load i64, ptr addrspace(4) %sourceB, align 8
218+
%cond.i = select i1 %tobool.not.i, i64 %ld0, i64 %ld1
219+
store i64 %cond.i, ptr addrspace(1) %dest.gep, align 8
220+
ret void
221+
}
222+
223+
attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
224+
225+
!0 = !{}

0 commit comments

Comments
 (0)