Skip to content

Commit 9a93d9a

Browse files
jayfoad authored and aokblast committed
[AMDGPU] Add target feature for waits before system scope stores. NFC. (llvm#164993)
1 parent 92cca42 commit 9a93d9a

File tree

4 files changed

+55
-13
lines changed

4 files changed

+55
-13
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1466,6 +1466,13 @@ def FeatureClusters : SubtargetFeature< "clusters",
14661466
"Has clusters of workgroups support"
14671467
>;
14681468

1469+
def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature<
1470+
"waits-before-system-scope-stores",
1471+
"RequiresWaitsBeforeSystemScopeStores",
1472+
"true",
1473+
"Target requires waits for loads and atomics before system scope stores"
1474+
>;
1475+
14691476
// Dummy feature used to disable assembler instructions.
14701477
def FeatureDisable : SubtargetFeature<"",
14711478
"FeatureDisable","true",
@@ -2060,7 +2067,8 @@ def FeatureISAVersion12 : FeatureSet<
20602067
FeatureMaxHardClauseLength32,
20612068
Feature1_5xVGPRs,
20622069
FeatureMemoryAtomicFAddF32DenormalSupport,
2063-
FeatureBVHDualAndBVH8Insts
2070+
FeatureBVHDualAndBVH8Insts,
2071+
FeatureWaitsBeforeSystemScopeStores,
20642072
]>;
20652073

20662074
def FeatureISAVersion12_50 : FeatureSet<

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
290290
bool Has45BitNumRecordsBufferResource = false;
291291

292292
bool HasClusters = false;
293+
bool RequiresWaitsBeforeSystemScopeStores = false;
293294

294295
// Dummy feature to use for assembler in tablegen.
295296
bool FeatureDisable = false;
@@ -1861,6 +1862,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18611862
bool has45BitNumRecordsBufferResource() const {
18621863
return Has45BitNumRecordsBufferResource;
18631864
}
1865+
1866+
bool requiresWaitsBeforeSystemScopeStores() const {
1867+
return RequiresWaitsBeforeSystemScopeStores;
1868+
}
18641869
};
18651870

18661871
class GCNUserSGPRUsageInfo {

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2673,7 +2673,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
26732673
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
26742674

26752675
// GFX12.0 only: Extra waits needed before system scope stores.
2676-
if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
2676+
if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
2677+
Scope == CPol::SCOPE_SYS)
26772678
Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
26782679

26792680
return Changed;
Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,50 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2-
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3-
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
3+
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
4+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s
5+
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-GISEL %s
46

57
define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
6-
; GFX12-LABEL: intrinsic_store_system_scope:
7-
; GFX12: ; %bb.0:
8-
; GFX12-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
9-
; GFX12-NEXT: s_endpgm
8+
; GFX1200-LABEL: intrinsic_store_system_scope:
9+
; GFX1200: ; %bb.0:
10+
; GFX1200-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
11+
; GFX1200-NEXT: s_endpgm
12+
;
13+
; GFX1250-SDAG-LABEL: intrinsic_store_system_scope:
14+
; GFX1250-SDAG: ; %bb.0:
15+
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
16+
; GFX1250-SDAG-NEXT: buffer_store_b32 v0, v[2:3], s[0:3], s4 idxen offen scope:SCOPE_SYS
17+
; GFX1250-SDAG-NEXT: s_endpgm
18+
;
19+
; GFX1250-GISEL-LABEL: intrinsic_store_system_scope:
20+
; GFX1250-GISEL: ; %bb.0:
21+
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
22+
; GFX1250-GISEL-NEXT: buffer_store_b32 v0, v[4:5], s[0:3], s4 idxen offen scope:SCOPE_SYS
23+
; GFX1250-GISEL-NEXT: s_endpgm
1024
call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24)
1125
ret void
1226
}
1327

1428
define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) {
15-
; GFX12-LABEL: generic_store_volatile:
16-
; GFX12: ; %bb.0:
17-
; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_SYS
18-
; GFX12-NEXT: s_wait_storecnt 0x0
19-
; GFX12-NEXT: s_endpgm
29+
; GFX1200-LABEL: generic_store_volatile:
30+
; GFX1200: ; %bb.0:
31+
; GFX1200-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_SYS
32+
; GFX1200-NEXT: s_wait_storecnt 0x0
33+
; GFX1200-NEXT: s_endpgm
34+
;
35+
; GFX1250-SDAG-LABEL: generic_store_volatile:
36+
; GFX1250-SDAG: ; %bb.0:
37+
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
38+
; GFX1250-SDAG-NEXT: global_store_b32 v[2:3], v0, off scope:SCOPE_SYS
39+
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
40+
; GFX1250-SDAG-NEXT: s_endpgm
41+
;
42+
; GFX1250-GISEL-LABEL: generic_store_volatile:
43+
; GFX1250-GISEL: ; %bb.0:
44+
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
45+
; GFX1250-GISEL-NEXT: global_store_b32 v[4:5], v0, off scope:SCOPE_SYS
46+
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
47+
; GFX1250-GISEL-NEXT: s_endpgm
2048
store volatile i32 %val, ptr addrspace(1) %out
2149
ret void
2250
}

0 commit comments

Comments
 (0)