Skip to content

Commit 97a66a8

Browse files
authored
[AMDGPU] Prohibit load/store merge if scale_offset is set on gfx1250 (#149895)
Scaling is done on the operation size; by merging instructions we would need to generate code to scale the offset and reset the auto-scale bit. It is unclear whether that would be beneficial, so just disable such merges for now.
1 parent 8f26a30 commit 97a66a8

File tree

2 files changed

+108
-1
lines changed

2 files changed

+108
-1
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
#include "AMDGPU.h"
6262
#include "GCNSubtarget.h"
6363
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
64+
#include "SIDefines.h"
6465
#include "llvm/Analysis/AliasAnalysis.h"
6566
#include "llvm/CodeGen/MachineFunctionPass.h"
6667
#include "llvm/InitializePasses.h"
@@ -1078,7 +1079,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
10781079
if (EltOffset0 + CI.Width != EltOffset1 &&
10791080
EltOffset1 + Paired.Width != EltOffset0)
10801081
return false;
1081-
if (CI.CPol != Paired.CPol)
1082+
// Instructions with scale_offset modifier cannot be combined unless we
1083+
// also generate code to scale the offset and reset that bit.
1084+
if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
10821085
return false;
10831086
if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
10841087
CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GCN %s
3+
4+
---
5+
name: merge_global_load_dword_2_no_scale_offset
6+
body: |
7+
bb.0.entry:
8+
9+
; GCN-LABEL: name: merge_global_load_dword_2_no_scale_offset
10+
; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
11+
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
12+
; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 0, 1, implicit $exec :: (load (s64) from `ptr addrspace(1) undef` + 4, align 4, addrspace 1)
13+
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
14+
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
15+
; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[COPY]]
16+
%0:sreg_64_xexec_xnull = IMPLICIT_DEF
17+
%1:vgpr_32 = IMPLICIT_DEF
18+
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
19+
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
20+
S_NOP 0, implicit %1, implicit %2
21+
...
22+
23+
---
24+
name: no_merge_global_load_dword_2_same_scale_offset
25+
body: |
26+
bb.0.entry:
27+
28+
; GCN-LABEL: name: no_merge_global_load_dword_2_same_scale_offset
29+
; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
30+
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
31+
; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1)
32+
; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1)
33+
; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
34+
%0:sreg_64_xexec_xnull = IMPLICIT_DEF
35+
%1:vgpr_32 = IMPLICIT_DEF
36+
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2049, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
37+
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2049, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
38+
S_NOP 0, implicit %1, implicit %2
39+
...
40+
41+
---
42+
name: no_merge_global_load_dword_2_different_scale_offset
43+
body: |
44+
bb.0.entry:
45+
46+
; GCN-LABEL: name: no_merge_global_load_dword_2_different_scale_offset
47+
; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
48+
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
49+
; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1)
50+
; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2048, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1)
51+
; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
52+
%0:sreg_64_xexec_xnull = IMPLICIT_DEF
53+
%1:vgpr_32 = IMPLICIT_DEF
54+
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
55+
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2048, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
56+
S_NOP 0, implicit %1, implicit %2
57+
...
58+
59+
# NB: We do not currently support merging SGPR offset and SGPR+Imm offset forms
60+
# of S_LOAD, but the check stays the same: these cannot be merged with different
61+
# scale offsets.
62+
#
63+
# We also do not currently merge flat scratch instructions, although there is
64+
# a common check in the merge logic that CPol must not be set for the merge to happen.
65+
66+
---
67+
name: merge_s_load_x1_x1_imm_no_scale_offset
68+
body: |
69+
bb.0:
70+
; GCN-LABEL: name: merge_s_load_x1_x1_imm_no_scale_offset
71+
; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
72+
; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
73+
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
74+
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
75+
%0:sgpr_64 = IMPLICIT_DEF
76+
%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
77+
%2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
78+
...
79+
80+
---
81+
name: no_merge_s_load_x1_x1_imm_same_scale_offset
82+
body: |
83+
bb.0:
84+
; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_same_scale_offset
85+
; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
86+
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 2048 :: (dereferenceable invariant load (s32))
87+
; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32))
88+
%0:sgpr_64 = IMPLICIT_DEF
89+
%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 2048 :: (dereferenceable invariant load (s32))
90+
%2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32))
91+
...
92+
93+
---
94+
name: no_merge_s_load_x1_x1_imm_different_scale_offset
95+
body: |
96+
bb.0:
97+
; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_different_scale_offset
98+
; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
99+
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
100+
; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32))
101+
%0:sgpr_64 = IMPLICIT_DEF
102+
%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
103+
%2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32))
104+
...

0 commit comments

Comments
 (0)