Skip to content

Commit d560769

Browse files
authored
[AMDGPU][SDAG] DAGCombine PTRADD -> disjoint OR (#146075)
If we can't fold a PTRADD's offset into its users, lowering the PTRADD to a disjoint OR is preferable: often, a single 32-bit OR instruction suffices where we'd otherwise need a pair of 32-bit additions with carry. This needs to be a DAGCombine (and not a selection rule) because its main purpose is to enable subsequent DAGCombines for bitwise operations. We don't want to turn PTRADDs into disjoint ORs whenever that's sound, because the transform loses the information that the operation implements pointer arithmetic, which AMDGPU, for instance, needs when folding constant offsets. For SWDEV-516125.
1 parent 91dba22 commit d560769

File tree

2 files changed

+66
-1
lines changed

2 files changed

+66
-1
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2774,6 +2774,19 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
27742774
}
27752775
}
27762776

2777+
// Transform (ptradd a, b) -> (or disjoint a, b) if it is equivalent and if
2778+
// that transformation can't block an offset folding at any use of the ptradd.
2779+
// This should be done late, after legalization, so that it doesn't block
2780+
// other ptradd combines that could enable more offset folding.
2781+
if (LegalOperations && DAG.haveNoCommonBitsSet(N0, N1)) {
2782+
bool TransformCannotBreakAddrMode = none_of(N->users(), [&](SDNode *User) {
2783+
return canFoldInAddressingMode(N, User, DAG, TLI);
2784+
});
2785+
2786+
if (TransformCannotBreakAddrMode)
2787+
return DAG.getNode(ISD::OR, DL, PtrVT, N0, N1, SDNodeFlags::Disjoint);
2788+
}
2789+
27772790
return SDValue();
27782791
}
27792792

llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ define void @baseptr_null(i64 %offset, i8 %v) {
100100

101101
; Taken from implicit-kernarg-backend-usage.ll, tests the PTRADD handling in the
102102
; assertalign DAG combine.
103-
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
103+
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
104104
; GFX942-LABEL: llvm_amdgcn_queue_ptr:
105105
; GFX942: ; %bb.0:
106106
; GFX942-NEXT: v_mov_b32_e32 v0, 0
@@ -415,6 +415,58 @@ entry:
415415
ret void
416416
}
417417

418+
; Check that ptradds can be lowered to disjoint ORs.
419+
define ptr @gep_disjoint_or(ptr %base) {
420+
; GFX942-LABEL: gep_disjoint_or:
421+
; GFX942: ; %bb.0:
422+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
423+
; GFX942-NEXT: v_and_or_b32 v0, v0, -16, 4
424+
; GFX942-NEXT: s_setpc_b64 s[30:31]
425+
%p = call ptr @llvm.ptrmask(ptr %base, i64 s0xf0)
426+
%gep = getelementptr nuw inbounds i8, ptr %p, i64 4
427+
ret ptr %gep
428+
}
429+
430+
; Check that AssertAlign nodes between ptradd nodes don't block offset folding,
431+
; taken from preload-implicit-kernargs.ll
432+
define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) {
433+
; GFX942_PTRADD-LABEL: random_incorrect_offset:
434+
; GFX942_PTRADD: ; %bb.1:
435+
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
436+
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
437+
; GFX942_PTRADD-NEXT: s_branch .LBB21_0
438+
; GFX942_PTRADD-NEXT: .p2align 8
439+
; GFX942_PTRADD-NEXT: ; %bb.2:
440+
; GFX942_PTRADD-NEXT: .LBB21_0:
441+
; GFX942_PTRADD-NEXT: s_load_dword s0, s[4:5], 0xa
442+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, 0
443+
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
444+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s0
445+
; GFX942_PTRADD-NEXT: global_store_dword v0, v1, s[8:9]
446+
; GFX942_PTRADD-NEXT: s_endpgm
447+
;
448+
; GFX942_LEGACY-LABEL: random_incorrect_offset:
449+
; GFX942_LEGACY: ; %bb.1:
450+
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
451+
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
452+
; GFX942_LEGACY-NEXT: s_branch .LBB21_0
453+
; GFX942_LEGACY-NEXT: .p2align 8
454+
; GFX942_LEGACY-NEXT: ; %bb.2:
455+
; GFX942_LEGACY-NEXT: .LBB21_0:
456+
; GFX942_LEGACY-NEXT: s_mov_b32 s0, 8
457+
; GFX942_LEGACY-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2
458+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, 0
459+
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
460+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s0
461+
; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[8:9]
462+
; GFX942_LEGACY-NEXT: s_endpgm
463+
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
464+
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
465+
%load = load i32, ptr addrspace(4) %gep
466+
store i32 %load, ptr addrspace(1) %out
467+
ret void
468+
}
469+
418470
declare void @llvm.memcpy.p0.p4.i64(ptr noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg)
419471

420472
!0 = !{}

0 commit comments

Comments
 (0)