Skip to content

Commit 4352a76

Browse files
[LICM] Improve LICM when calls only change Inaccessible memory
Extend `MemorySSA`’s clobber query to better distinguish calls that access inaccessible memory, improving code motion opportunities in loops. If both calls don't clobber Inaccessible Memory Location, it can return from instructionClobbersQuery without setting ModRefInfo. Otherwise it relies on the default behaviour to set ModRefInfo to Read and Write. This enables LICM to hoist calls that modify inaccessible memory, improving code motion opportunities in loops.
1 parent bc4143b commit 4352a76

File tree

4 files changed

+199
-95
lines changed

4 files changed

+199
-95
lines changed

llvm/lib/Analysis/MemorySSA.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,17 @@ static bool areLoadsReorderable(const LoadInst *Use,
277277
return !(SeqCstUse || MayClobberIsAcquire);
278278
}
279279

280+
// Helper for the clobber query: decides whether a pair of calls still has to
// be treated as a potential clobber when inaccessible-memory-only calls are
// involved. The call-vs-call path in instructionClobbersQuery returns "no
// clobber" when this returns false, and otherwise falls through to the
// regular AA.getModRefInfo() check.
//
// CallFirst / CallSecond: the two call sites being compared.
// Returns true when a clobber must still be assumed, false when it can be
// ruled out.
bool hasInaccessibleMemoryClobber(const CallBase *CallFirst,
281+
const CallBase *CallSecond) {
282+
283+
// Memory effects reported for each call site.
MemoryEffects ME1 = CallFirst->getMemoryEffects();
284+
MemoryEffects ME2 = CallSecond->getMemoryEffects();
285+
// The refinement applies as soon as either call is restricted to
// inaccessible memory.
if (CallFirst->onlyAccessesInaccessibleMemory() ||
286+
CallSecond->onlyAccessesInaccessibleMemory())
287+
// Combine both effects with the write-only mask and report a clobber unless
// the result only reads memory.
// NOTE(review): presumably `&` intersects effects per memory location, so
// this is true only when both calls may write a common location -- confirm
// against the MemoryEffects documentation.
return !(ME1 & ME2 & MemoryEffects::writeOnly()).onlyReadsMemory();
288+
// Neither call is limited to inaccessible memory: stay conservative.
return true;
289+
}
290+
280291
template <typename AliasAnalysisType>
281292
static bool
282293
instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc,
@@ -311,6 +322,9 @@ instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc,
311322
}
312323

313324
if (auto *CB = dyn_cast_or_null<CallBase>(UseInst)) {
325+
if (auto *CU = dyn_cast_or_null<CallBase>(DefInst))
326+
if (!hasInaccessibleMemoryClobber(CB, CU))
327+
return false;
314328
ModRefInfo I = AA.getModRefInfo(DefInst, CB);
315329
return isModOrRefSet(I);
316330
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll

Lines changed: 45 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -885,7 +885,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
885885
; SI-NEXT: s_mov_b64 s[0:1], exec
886886
; SI-NEXT: s_wqm_b64 exec, exec
887887
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
888-
; SI-NEXT: s_mov_b32 s4, 0
888+
; SI-NEXT: s_mov_b32 s6, 0
889889
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
890890
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
891891
; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -894,24 +894,24 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
894894
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
895895
; SI-NEXT: s_cbranch_scc0 .LBB7_9
896896
; SI-NEXT: ; %bb.2: ; %.demote0
897-
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
898-
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
897+
; SI-NEXT: s_wqm_b64 s[4:5], s[0:1]
898+
; SI-NEXT: s_and_b64 exec, exec, s[4:5]
899899
; SI-NEXT: .LBB7_3: ; %.continue0.preheader
900900
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
901+
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
901902
; SI-NEXT: s_mov_b64 s[2:3], 0
902-
; SI-NEXT: v_mov_b32_e32 v0, s4
903+
; SI-NEXT: v_mov_b32_e32 v0, s6
903904
; SI-NEXT: s_branch .LBB7_5
904905
; SI-NEXT: .LBB7_4: ; %.continue1
905906
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
906-
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
907+
; SI-NEXT: s_or_b64 exec, exec, s[6:7]
907908
; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
908909
; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
909910
; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
910911
; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
911912
; SI-NEXT: s_cbranch_execz .LBB7_8
912913
; SI-NEXT: .LBB7_5: ; %.continue0
913914
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
914-
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
915915
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
916916
; SI-NEXT: v_mov_b32_e32 v3, v2
917917
; SI-NEXT: s_nop 1
@@ -920,19 +920,19 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
920920
; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
921921
; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
922922
; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
923-
; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc
924-
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1
925-
; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
926-
; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
923+
; SI-NEXT: s_and_b64 s[6:7], s[4:5], vcc
924+
; SI-NEXT: s_xor_b64 s[6:7], s[6:7], -1
925+
; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
926+
; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
927927
; SI-NEXT: s_cbranch_execz .LBB7_4
928928
; SI-NEXT: ; %bb.6: ; %.demote1
929929
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
930930
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
931931
; SI-NEXT: s_cbranch_scc0 .LBB7_9
932932
; SI-NEXT: ; %bb.7: ; %.demote1
933933
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
934-
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
935-
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
934+
; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
935+
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
936936
; SI-NEXT: s_branch .LBB7_4
937937
; SI-NEXT: .LBB7_8: ; %.return
938938
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -951,7 +951,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
951951
; GFX9-NEXT: s_mov_b64 s[0:1], exec
952952
; GFX9-NEXT: s_wqm_b64 exec, exec
953953
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
954-
; GFX9-NEXT: s_mov_b32 s4, 0
954+
; GFX9-NEXT: s_mov_b32 s6, 0
955955
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
956956
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
957957
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -960,24 +960,24 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
960960
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
961961
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
962962
; GFX9-NEXT: ; %bb.2: ; %.demote0
963-
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
964-
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
963+
; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1]
964+
; GFX9-NEXT: s_and_b64 exec, exec, s[4:5]
965965
; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
966966
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
967+
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
967968
; GFX9-NEXT: s_mov_b64 s[2:3], 0
968-
; GFX9-NEXT: v_mov_b32_e32 v0, s4
969+
; GFX9-NEXT: v_mov_b32_e32 v0, s6
969970
; GFX9-NEXT: s_branch .LBB7_5
970971
; GFX9-NEXT: .LBB7_4: ; %.continue1
971972
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
972-
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
973+
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
973974
; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
974975
; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
975976
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
976977
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
977978
; GFX9-NEXT: s_cbranch_execz .LBB7_8
978979
; GFX9-NEXT: .LBB7_5: ; %.continue0
979980
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
980-
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
981981
; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
982982
; GFX9-NEXT: v_mov_b32_e32 v3, v2
983983
; GFX9-NEXT: s_nop 1
@@ -986,19 +986,19 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
986986
; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
987987
; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
988988
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
989-
; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc
990-
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1
991-
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
992-
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
989+
; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], vcc
990+
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], -1
991+
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
992+
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
993993
; GFX9-NEXT: s_cbranch_execz .LBB7_4
994994
; GFX9-NEXT: ; %bb.6: ; %.demote1
995995
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
996996
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
997997
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
998998
; GFX9-NEXT: ; %bb.7: ; %.demote1
999999
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
1000-
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
1001-
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
1000+
; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1]
1001+
; GFX9-NEXT: s_and_b64 exec, exec, s[8:9]
10021002
; GFX9-NEXT: s_branch .LBB7_4
10031003
; GFX9-NEXT: .LBB7_8: ; %.return
10041004
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -1031,37 +1031,37 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
10311031
; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader
10321032
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
10331033
; GFX10-32-NEXT: v_mov_b32_e32 v0, s1
1034+
; GFX10-32-NEXT: s_mov_b32 s2, s0
10341035
; GFX10-32-NEXT: s_branch .LBB7_5
10351036
; GFX10-32-NEXT: .LBB7_4: ; %.continue1
10361037
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
1037-
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
1038+
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3
10381039
; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0
10391040
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1
10401041
; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1
10411042
; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
10421043
; GFX10-32-NEXT: s_cbranch_execz .LBB7_8
10431044
; GFX10-32-NEXT: .LBB7_5: ; %.continue0
10441045
; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
1045-
; GFX10-32-NEXT: s_mov_b32 s2, s0
10461046
; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2
10471047
; GFX10-32-NEXT: v_mov_b32_e32 v3, v2
10481048
; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
10491049
; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
10501050
; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
10511051
; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
1052-
; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo
1053-
; GFX10-32-NEXT: s_xor_b32 s2, s2, -1
1054-
; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2
1055-
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3
1052+
; GFX10-32-NEXT: s_and_b32 s3, s2, vcc_lo
1053+
; GFX10-32-NEXT: s_xor_b32 s3, s3, -1
1054+
; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3
1055+
; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4
10561056
; GFX10-32-NEXT: s_cbranch_execz .LBB7_4
10571057
; GFX10-32-NEXT: ; %bb.6: ; %.demote1
10581058
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
10591059
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
10601060
; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
10611061
; GFX10-32-NEXT: ; %bb.7: ; %.demote1
10621062
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
1063-
; GFX10-32-NEXT: s_wqm_b32 s3, s0
1064-
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
1063+
; GFX10-32-NEXT: s_wqm_b32 s4, s0
1064+
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
10651065
; GFX10-32-NEXT: s_branch .LBB7_4
10661066
; GFX10-32-NEXT: .LBB7_8: ; %.return
10671067
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
@@ -1094,41 +1094,41 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
10941094
; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
10951095
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
10961096
; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
1097-
; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
1097+
; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1]
1098+
; GFX10-64-NEXT: s_mov_b64 s[4:5], 0
10981099
; GFX10-64-NEXT: s_branch .LBB7_5
10991100
; GFX10-64-NEXT: .LBB7_4: ; %.continue1
11001101
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
1101-
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
1102+
; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7]
11021103
; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0
11031104
; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
1104-
; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1105-
; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3]
1105+
; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1106+
; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5]
11061107
; GFX10-64-NEXT: s_cbranch_execz .LBB7_8
11071108
; GFX10-64-NEXT: .LBB7_5: ; %.continue0
11081109
; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
1109-
; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1]
1110-
; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
1110+
; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3]
11111111
; GFX10-64-NEXT: v_mov_b32_e32 v3, v2
11121112
; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
11131113
; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
11141114
; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
11151115
; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
1116-
; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc
1117-
; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1
1118-
; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
1119-
; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
1116+
; GFX10-64-NEXT: s_and_b64 s[6:7], s[2:3], vcc
1117+
; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], -1
1118+
; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
1119+
; GFX10-64-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
11201120
; GFX10-64-NEXT: s_cbranch_execz .LBB7_4
11211121
; GFX10-64-NEXT: ; %bb.6: ; %.demote1
11221122
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
11231123
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
11241124
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
11251125
; GFX10-64-NEXT: ; %bb.7: ; %.demote1
11261126
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
1127-
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
1128-
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
1127+
; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1]
1128+
; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9]
11291129
; GFX10-64-NEXT: s_branch .LBB7_4
11301130
; GFX10-64-NEXT: .LBB7_8: ; %.return
1131-
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
1131+
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
11321132
; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
11331133
; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
11341134
; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60

0 commit comments

Comments
 (0)