[AMDGPU][SDAG] DAGCombine PTRADD -> disjoint OR
If we can't fold a PTRADD's offset into its users, lowering it to a disjoint OR instead is preferable: often, a single 32-bit OR instruction suffices where we'd otherwise use a pair of 32-bit additions with carry.

This needs to be a DAGCombine (and not a selection rule) because its main purpose is to enable subsequent DAGCombines for bitwise operations. We don't want to turn PTRADDs into disjoint ORs whenever that's sound, because the transform loses the information that the operation implements pointer arithmetic, which we will soon need in order to fold offsets into FLAT instructions. Currently, disjoint ORs can still be used for offset folding, so that part of the logic can't be tested yet.

The PR contains a hacky workaround for a situation where an AssertAlign operand of a PTRADD is not DAGCombined before the PTRADD itself, causing the PTRADD to be turned into a disjoint OR even though reassociating it with the operand of the AssertAlign would be better. This wouldn't be a problem if the DAGCombiner ensured that a node is only processed after all of its operands have been processed.

For SWDEV-516125.
1 parent a8fdd2e commit 65ffa2b
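
A rough standalone illustration of the motivation (mine, not part of the commit): with 32-bit ALU halves, a generic 64-bit pointer add needs an add plus an add-with-carry, while an offset that provably shares no bits with the base can be merged into the low half with a single OR, since no carry can be produced.

```cpp
#include <cassert>
#include <cstdint>

// A 64-bit address split into the 32-bit halves a 32-bit ALU operates on.
struct Addr64 { uint32_t Lo, Hi; };

// Generic path: pair of 32-bit additions with carry.
static Addr64 addCarry(Addr64 P, uint32_t Off) {
  uint32_t Lo = P.Lo + Off;
  uint32_t Carry = Lo < P.Lo; // carry-out of the low add
  return {Lo, P.Hi + Carry};
}

// Disjoint path: one 32-bit OR, valid only if (P.Lo & Off) == 0, in which
// case the low add cannot carry and the high half is untouched.
static Addr64 orDisjoint(Addr64 P, uint32_t Off) {
  assert((P.Lo & Off) == 0 && "addends must share no set bits");
  return {P.Lo | Off, P.Hi};
}

int main() {
  Addr64 P{0x1000, 0xffff8000}; // 16-byte-aligned base address
  uint32_t Off = 0xc;           // offset below the alignment
  assert(addCarry(P, Off).Lo == orDisjoint(P, Off).Lo);
  assert(addCarry(P, Off).Hi == orDisjoint(P, Off).Hi);
}
```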

File tree

2 files changed: +90 -1 lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 35 additions & 0 deletions
@@ -15370,6 +15370,41 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
     return Folded;
   }
 
+  // Transform (ptradd a, b) -> (or disjoint a, b) if it is equivalent and if
+  // that transformation can't block an offset folding at any use of the ptradd.
+  // This should be done late, after legalization, so that it doesn't block
+  // other ptradd combines that could enable more offset folding.
+  bool HasIntermediateAssertAlign =
+      N0->getOpcode() == ISD::AssertAlign && N0->getOperand(0)->isAnyAdd();
+  // This is a hack to work around an ordering problem for DAGs like this:
+  //   (ptradd (AssertAlign (ptradd p, c1), k), c2)
+  // If the outer ptradd is handled first by the DAGCombiner, it can be
+  // transformed into a disjoint or. Then, when the generic AssertAlign combine
+  // pushes the AssertAlign through the inner ptradd, it's too late for the
+  // ptradd reassociation to trigger.
+  if (!DCI.isBeforeLegalizeOps() && !HasIntermediateAssertAlign &&
+      DAG.haveNoCommonBitsSet(N0, N1)) {
+    bool TransformCanBreakAddrMode = any_of(N->users(), [&](SDNode *User) {
+      if (auto *LoadStore = dyn_cast<MemSDNode>(User);
+          LoadStore && LoadStore->getBasePtr().getNode() == N) {
+        unsigned AS = LoadStore->getAddressSpace();
+        // Currently, we only really need ptradds to fold offsets into flat
+        // memory instructions.
+        if (AS != AMDGPUAS::FLAT_ADDRESS)
+          return false;
+        TargetLoweringBase::AddrMode AM;
+        AM.HasBaseReg = true;
+        EVT VT = LoadStore->getMemoryVT();
+        Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
+        return isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS);
+      }
+      return false;
+    });
+
+    if (!TransformCanBreakAddrMode)
+      return DAG.getNode(ISD::OR, DL, VT, N0, N1, SDNodeFlags::Disjoint);
+  }
+
   if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
     return SDValue();

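A simplified standalone model of the gating in performPtrAddCombine above; the types and the OffsetFoldLegal field are illustrative stand-ins of my own, not LLVM's, with OffsetFoldLegal replacing the isLegalAddressingMode query:

```cpp
#include <vector>

enum class AddrSpace { Flat, Global, Local };

struct MemUser {
  AddrSpace AS;         // address space of the memory access
  bool NodeIsBasePtr;   // does the access use the ptradd as its base pointer?
  bool OffsetFoldLegal; // stand-in for the isLegalAddressingMode query
};

// Mirrors the any_of over N->users(): non-memory users and non-flat accesses
// can't lose an offset fold, so only flat accesses based on this ptradd count.
static bool transformCanBreakAddrMode(const std::vector<MemUser> &Users) {
  for (const MemUser &U : Users) {
    if (!U.NodeIsBasePtr || U.AS != AddrSpace::Flat)
      continue;
    if (U.OffsetFoldLegal)
      return true; // keep the ptradd so the offset can still be folded
  }
  return false;
}

int main() {
  // A flat load based on the ptradd blocks the disjoint-OR transform;
  // a global store does not.
  std::vector<MemUser> Users{{AddrSpace::Flat, true, true},
                             {AddrSpace::Global, true, true}};
  return transformCanBreakAddrMode(Users) ? 0 : 1;
}
```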
llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

Lines changed: 55 additions & 1 deletion
@@ -100,7 +100,7 @@ define void @baseptr_null(i64 %offset, i8 %v) {
 
 ; Taken from implicit-kernarg-backend-usage.ll, tests the PTRADD handling in the
 ; assertalign DAG combine.
-define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
 ; GFX942-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
@@ -416,6 +416,60 @@ entry:
   ret void
 }
 
+; Check that ptradds can be lowered to disjoint ORs.
+define ptr @gep_disjoint_or(ptr %base) {
+; GFX942-LABEL: gep_disjoint_or:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_and_or_b32 v0, v0, -16, 4
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %p = call ptr @llvm.ptrmask(ptr %base, i64 s0xf0)
+  %gep = getelementptr nuw inbounds i8, ptr %p, i64 4
+  ret ptr %gep
+}
+
+; Check that AssertAlign nodes between ptradd nodes don't block offset folding,
+; taken from preload-implicit-kernargs.ll
+define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) #0 {
+; GFX942_PTRADD-LABEL: random_incorrect_offset:
+; GFX942_PTRADD:       ; %bb.1:
+; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:    s_branch .LBB21_0
+; GFX942_PTRADD-NEXT:    .p2align 8
+; GFX942_PTRADD-NEXT:  ; %bb.2:
+; GFX942_PTRADD-NEXT:  .LBB21_0:
+; GFX942_PTRADD-NEXT:    s_load_dword s0, s[0:1], 0xa
+; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942_PTRADD-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942_PTRADD-NEXT:    s_endpgm
+;
+; GFX942_LEGACY-LABEL: random_incorrect_offset:
+; GFX942_LEGACY:       ; %bb.1:
+; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_LEGACY-NEXT:    s_branch .LBB21_0
+; GFX942_LEGACY-NEXT:    .p2align 8
+; GFX942_LEGACY-NEXT:  ; %bb.2:
+; GFX942_LEGACY-NEXT:  .LBB21_0:
+; GFX942_LEGACY-NEXT:    s_mov_b32 s4, 8
+; GFX942_LEGACY-NEXT:    s_load_dword s0, s[0:1], s4 offset:0x2
+; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942_LEGACY-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942_LEGACY-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
+  %load = load i32, ptr addrspace(4) %gep
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
 declare void @llvm.memcpy.p0.p4.i64(ptr noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg)
 
 !0 = !{}
+
+attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

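An illustrative sanity check (mine, not part of the test) of what gep_disjoint_or expects: the s0xf0 mask in the IR is a sign-extended hexadecimal literal, i.e. -16, so the selected v_and_or_b32 computes (base & -16) | 4, which equals (base & -16) + 4 because masking clears the low four bits and makes the OR disjoint.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 0x7fff12345677;          // arbitrary flat pointer value
  uint64_t Masked = Base & ~UINT64_C(0xf); // llvm.ptrmask with mask -16
  assert((Masked & 4) == 0);               // no common bits with offset 4
  assert(Masked + 4 == (Masked | 4));      // ptradd == disjoint or
}
```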