From 4352a7665dfa165407302adc6b9282497ed5b714 Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Fri, 31 Oct 2025 16:26:46 +0000 Subject: [PATCH 1/2] [LICM] Improve LICM when calls only change Inaccessible memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend `MemorySSA`’s clobber query to better distinguish calls that access inaccessible memory, improving code motion opportunities in loops. If both calls don't clobber the inaccessible memory location, it can return from instructionClobbersQuery without setting ModRefInfo. Otherwise it relies on the default behaviour to set ModRefInfo to Read and Write. This enables LICM to hoist calls that modify inaccessible memory, improving code motion opportunities in loops. --- llvm/lib/Analysis/MemorySSA.cpp | 14 +++ .../GlobalISel/llvm.amdgcn.wqm.demote.ll | 90 ++++++++-------- .../CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll | 100 +++++++++--------- .../LICM/hoist-inaccesiblemem-call.ll | 90 ++++++++++++++++ 4 files changed, 199 insertions(+), 95 deletions(-) create mode 100644 llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index 0b2e3fcfd76df..336ca21d48687 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -277,6 +277,17 @@ static bool areLoadsReorderable(const LoadInst *Use, return !(SeqCstUse || MayClobberIsAcquire); } +bool hasInaccessibleMemoryClobber(const CallBase *CallFirst, + const CallBase *CallSecond) { + + MemoryEffects ME1 = CallFirst->getMemoryEffects(); + MemoryEffects ME2 = CallSecond->getMemoryEffects(); + if (CallFirst->onlyAccessesInaccessibleMemory() || + CallSecond->onlyAccessesInaccessibleMemory()) + return !(ME1 & ME2 & MemoryEffects::writeOnly()).onlyReadsMemory(); + return true; +} + template static bool instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc, @@ -311,6 +322,9 @@ instructionClobbersQuery(const 
MemoryDef *MD, const MemoryLocation &UseLoc, } if (auto *CB = dyn_cast_or_null(UseInst)) { + if (auto *CU = dyn_cast_or_null(DefInst)) + if (!hasInaccessibleMemoryClobber(CB, CU)) + return false; ModRefInfo I = AA.getModRefInfo(DefInst, CB); return isModOrRefSet(I); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll index 8a53c862371cf..8291058858bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -885,7 +885,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -894,16 +894,17 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 -; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[4:5] ; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_branch .LBB7_5 ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -911,7 +912,6 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_cbranch_execz .LBB7_8 ; 
SI-NEXT: .LBB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v3, v2 ; SI-NEXT: s_nop 1 @@ -920,10 +920,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SI-NEXT: s_and_b64 s[6:7], s[4:5], vcc +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9] ; SI-NEXT: s_cbranch_execz .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -931,8 +931,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.7: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return ; SI-NEXT: s_or_b64 exec, exec, s[2:3] @@ -951,7 +951,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -960,16 +960,17 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: 
s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 -; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_branch .LBB7_5 ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -977,7 +978,6 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_cbranch_execz .LBB7_8 ; GFX9-NEXT: .LBB7_5: ; %.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: s_nop 1 @@ -986,10 +986,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[8:9] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -997,8 +997,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float 
%arg, i32 %index ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.7: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_branch .LBB7_4 ; GFX9-NEXT: .LBB7_8: ; %.return ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1031,10 +1031,11 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-32-NEXT: s_mov_b32 s2, s0 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1 ; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -1042,17 +1043,16 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_cbranch_execz .LBB7_8 ; GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s2, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, s2, -1 -; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2 -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3 +; GFX10-32-NEXT: s_and_b32 s3, s2, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s3, s3, -1 +; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 
+; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1060,8 +1060,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-32-NEXT: ; %bb.7: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_wqm_b32 s3, s0 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_wqm_b32 s4, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 ; GFX10-32-NEXT: s_branch .LBB7_4 ; GFX10-32-NEXT: .LBB7_8: ; %.return ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 @@ -1094,29 +1094,29 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 +; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-64-NEXT: s_branch .LBB7_5 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 -; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_cbranch_execz .LBB7_8 ; GFX10-64-NEXT: .LBB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp 
v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX10-64-NEXT: s_and_b64 s[6:7], s[2:3], vcc +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX10-64-NEXT: s_xor_b64 s[6:7], exec, s[8:9] ; GFX10-64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1124,11 +1124,11 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-64-NEXT: ; %bb.7: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] ; GFX10-64-NEXT: s_branch .LBB7_4 ; GFX10-64-NEXT: .LBB7_8: ; %.return -; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index c98feeb96232d..499d257cf38d4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -887,7 +887,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: 
s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -901,31 +901,31 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: s_xor_b64 s[6:7], s[0:1], -1 ; SI-NEXT: s_branch .LBB7_5 ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_add_i32 s6, s6, 1 -; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_add_i32 s10, s10, 1 +; SI-NEXT: v_cmp_ge_i32_e32 vcc, s10, v1 ; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB7_8 ; SI-NEXT: .LBB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; SI-NEXT: s_nop 0 +; SI-NEXT: s_nop 1 ; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; SI-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; SI-NEXT: s_and_saveexec_b64 s[12:13], s[8:9] +; SI-NEXT: s_xor_b64 s[8:9], exec, s[12:13] ; SI-NEXT: s_cbranch_execz .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -933,8 +933,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.7: 
; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_wqm_b64 s[12:13], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[12:13] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return ; SI-NEXT: s_or_b64 exec, exec, s[2:3] @@ -953,7 +953,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s10, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -967,31 +967,31 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], -1 ; GFX9-NEXT: s_branch .LBB7_5 ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_add_i32 s6, s6, 1 -; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_add_i32 s10, s10, 1 +; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s10, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB7_8 ; GFX9-NEXT: .LBB7_5: ; %.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: 
v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; GFX9-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[12:13], s[8:9] +; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[12:13] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -999,8 +999,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.7: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] -; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_wqm_b64 s[12:13], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: s_branch .LBB7_4 ; GFX9-NEXT: .LBB7_8: ; %.return ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1032,29 +1032,29 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: s_mov_b32 s2, 0 +; GFX10-32-NEXT: s_mov_b32 s2, s0 +; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; GFX10-32-NEXT: s_mov_b32 s4, 0 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: s_add_i32 s2, s2, 1 -; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-32-NEXT: s_add_i32 s4, s4, 1 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s4, v1 ; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_8 ; 
GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s3, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 -; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s4, 0, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 -; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 +; GFX10-32-NEXT: s_or_b32 s5, s3, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s6, s5 +; GFX10-32-NEXT: s_xor_b32 s5, exec_lo, s6 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1062,8 +1062,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-32-NEXT: ; %bb.7: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_wqm_b32 s4, s0 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: s_wqm_b32 s6, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s6 ; GFX10-32-NEXT: s_branch .LBB7_4 ; GFX10-32-NEXT: .LBB7_8: ; %.return ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 @@ -1082,7 +1082,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-64-NEXT: s_mov_b32 s6, 0 +; GFX10-64-NEXT: s_mov_b32 s10, 0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1095,29 +1095,29 @@ define amdgpu_ps 
void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[0:1], -1 ; GFX10-64-NEXT: s_branch .LBB7_5 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: s_add_i32 s6, s6, 1 -; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_add_i32 s10, s10, 1 +; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s10, v1 ; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_cbranch_execz .LBB7_8 ; GFX10-64-NEXT: .LBB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], s[0:1], -1 +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s10, 0, s[4:5] ; GFX10-64-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; GFX10-64-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; GFX10-64-NEXT: s_and_saveexec_b64 s[12:13], s[8:9] +; GFX10-64-NEXT: s_xor_b64 s[8:9], exec, s[12:13] ; GFX10-64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1125,8 +1125,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> 
%input, float %arg, i32 %index ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-64-NEXT: ; %bb.7: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] -; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_wqm_b64 s[12:13], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: s_branch .LBB7_4 ; GFX10-64-NEXT: .LBB7_8: ; %.return ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] diff --git a/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll b/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll new file mode 100644 index 0000000000000..8b2ac7d8fdaa6 --- /dev/null +++ b/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -aa-pipeline=basic-aa -passes='require,require,loop-mssa(licm)' < %s -S | FileCheck %s + +define void @inaccessible_hoist(ptr noalias %loc, ptr noalias %loc2){ +; CHECK-LABEL: define void @inaccessible_hoist( +; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[LOC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LOC2]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr [[LOC]], align 4 +; CHECK-NEXT: call void @fn_write_inaccessible_mem() +; CHECK-NEXT: call void @fn_read_inaccessible_mem() +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*:]] +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_BODY]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: + %val = load i32, ptr %loc2 + store i32 %val, ptr %loc + call void @fn_write_inaccessible_mem() + call void @fn_read_inaccessible_mem() + br label %for.body +} + + +define void @neg_inaccessible_hoist(ptr noalias %loc, ptr noalias %loc2){ +; CHECK-LABEL: define void @neg_inaccessible_hoist( +; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[LOC2:%.*]]) { +; CHECK-NEXT: 
[[ENTRY:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LOC2]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr [[LOC]], align 4 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: call void @fn_write_inaccessible_mem() +; CHECK-NEXT: call void @fn_read_inaccessible_mem() +; CHECK-NEXT: call void @fn_readwrite_inaccessible_mem() +; CHECK-NEXT: br label %[[FOR_BODY]] +; +entry: + br label %for.body +for.body: + %val = load i32, ptr %loc2 + store i32 %val, ptr %loc + call void @fn_write_inaccessible_mem() + call void @fn_read_inaccessible_mem() + call void @fn_readwrite_inaccessible_mem() + br label %for.body +} + + +; Nothing should be hoisted from the loop because volatile +; sets inaccessible memory to read write +define void @neg_volatile(ptr %loc, ptr %loc2) { +; CHECK-LABEL: define void @neg_volatile( +; CHECK-SAME: ptr [[LOC:%.*]], ptr [[LOC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: store volatile i32 0, ptr [[LOC]], align 4 +; CHECK-NEXT: call void @fn_write_inaccessible_mem() +; CHECK-NEXT: call void @fn_read_inaccessible_mem() +; CHECK-NEXT: br label %[[LOOP]] +; +entry: + br label %loop + +loop: + %val = load i32, ptr %loc2 + store volatile i32 0, ptr %loc + call void @fn_write_inaccessible_mem() + call void @fn_read_inaccessible_mem() + br label %loop +} + +declare void @fn_write_inaccessible_mem()#0 + memory(inaccessiblemem: write) + +declare void @fn_read_inaccessible_mem()#0 + memory(inaccessiblemem: read) + +declare void @fn_readwrite_inaccessible_mem()#0 + memory(inaccessiblemem: readwrite) + +; Needs to set nounwind because of doesNotThrow +attributes #0 = { mustprogress nofree norecurse nosync nounwind} From 5f632c3961bdd3fe6e20632e911b1c43c6a0d626 Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Thu, 27 Nov 2025 16:01:22 +0000 Subject: [PATCH 2/2] Inline hasInaccessibleMemoryClobber into instructionClobbersQuery --- 
llvm/lib/Analysis/MemorySSA.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index 336ca21d48687..d81822ac75777 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -277,17 +277,6 @@ static bool areLoadsReorderable(const LoadInst *Use, return !(SeqCstUse || MayClobberIsAcquire); } -bool hasInaccessibleMemoryClobber(const CallBase *CallFirst, - const CallBase *CallSecond) { - - MemoryEffects ME1 = CallFirst->getMemoryEffects(); - MemoryEffects ME2 = CallSecond->getMemoryEffects(); - if (CallFirst->onlyAccessesInaccessibleMemory() || - CallSecond->onlyAccessesInaccessibleMemory()) - return !(ME1 & ME2 & MemoryEffects::writeOnly()).onlyReadsMemory(); - return true; -} - template static bool instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc, @@ -322,9 +311,14 @@ instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc, } if (auto *CB = dyn_cast_or_null(UseInst)) { - if (auto *CU = dyn_cast_or_null(DefInst)) - if (!hasInaccessibleMemoryClobber(CB, CU)) - return false; + if (auto *CU = dyn_cast_or_null(DefInst)) { + MemoryEffects CBME = CB->getMemoryEffects(); + MemoryEffects CUME = CU->getMemoryEffects(); + if (CBME.onlyAccessesInaccessibleMem() || + CUME.onlyAccessesInaccessibleMem()) + if ((CBME & CUME & MemoryEffects::writeOnly()).onlyReadsMemory()) + return false; + } ModRefInfo I = AA.getModRefInfo(DefInst, CB); return isModOrRefSet(I); }