Skip to content

Commit 9e0fea6

Browse files
committed
[AMDGPU] Use fake16 load/store with +real-true16 and sram-ecc
Fixes: SC1-6072
1 parent 9df1099 commit 9e0fea6

14 files changed

+3631
-1250
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2595,6 +2595,10 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
25952595
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
25962596
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
25972597

2598+
def Use32BitLoadStoreWithTrue16Insts : True16PredicateClass<"Subtarget->useRealTrue16Insts() && "
2599+
"!Subtarget->d16PreservesUnusedBits()">,
2600+
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureSRAMECC)>;
2601+
25982602
def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">,
25992603
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>;
26002604
def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">,

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -969,7 +969,7 @@ multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
969969
}
970970

971971
let OtherPredicates = [NotLDSRequiresM0Init] in {
972-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
972+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
973973
let True16Predicate = p in {
974974
def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
975975
}
@@ -1050,7 +1050,7 @@ multiclass DSWritePat_t16 <DS_Pseudo inst, ValueType vt, string frag> {
10501050
}
10511051

10521052
let OtherPredicates = [NotLDSRequiresM0Init] in {
1053-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
1053+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
10541054
let True16Predicate = p in {
10551055
def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
10561056
}

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1982,7 +1982,7 @@ defm : FlatLoadPats <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
19821982
defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
19831983
defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
19841984

1985-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
1985+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
19861986
let True16Predicate = p in {
19871987
defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
19881988
defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
@@ -2127,7 +2127,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, extloadi16_global, i32>;
21272127
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
21282128
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
21292129

2130-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
2130+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
21312131
let True16Predicate = p in {
21322132
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
21332133
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
@@ -2187,7 +2187,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
21872187
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
21882188
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX3, store_global, v3i32>;
21892189

2190-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
2190+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
21912191
let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in {
21922192
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i16>;
21932193
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
@@ -2356,7 +2356,7 @@ defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, extloadi16_private, i32>;
23562356
defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, zextloadi16_private, i32>;
23572357
defm : ScratchFLATLoadPats <SCRATCH_LOAD_SSHORT, sextloadi16_private, i32>;
23582358

2359-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
2359+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
23602360
let True16Predicate = p in {
23612361
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
23622362
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
@@ -2366,7 +2366,7 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
23662366
defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
23672367
}
23682368

2369-
let True16Predicate = UseRealTrue16Insts in {
2369+
let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
23702370
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
23712371
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
23722372
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;

llvm/test/CodeGen/AMDGPU/atomic_load_local.ll

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
44
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
55
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
68

79
define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
810
; CI-LABEL: atomic_load_monotonic_i8:
@@ -33,6 +35,14 @@ define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
3335
; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0
3436
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
3537
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
38+
;
39+
; GFX1250-LABEL: atomic_load_monotonic_i8:
40+
; GFX1250: ; %bb.0:
41+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
42+
; GFX1250-NEXT: s_wait_kmcnt 0x0
43+
; GFX1250-NEXT: ds_load_u8 v0, v0
44+
; GFX1250-NEXT: s_wait_dscnt 0x0
45+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
3646
%load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1
3747
ret i8 %load
3848
}
@@ -66,6 +76,14 @@ define i8 @atomic_load_monotonic_i8_offset(ptr addrspace(3) %ptr) {
6676
; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0 offset:16
6777
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
6878
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
79+
;
80+
; GFX1250-LABEL: atomic_load_monotonic_i8_offset:
81+
; GFX1250: ; %bb.0:
82+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
83+
; GFX1250-NEXT: s_wait_kmcnt 0x0
84+
; GFX1250-NEXT: ds_load_u8 v0, v0 offset:16
85+
; GFX1250-NEXT: s_wait_dscnt 0x0
86+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
6987
%gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
7088
%load = load atomic i8, ptr addrspace(3) %gep monotonic, align 1
7189
ret i8 %load
@@ -100,6 +118,14 @@ define i16 @atomic_load_monotonic_i16(ptr addrspace(3) %ptr) {
100118
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
101119
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
102120
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
121+
;
122+
; GFX1250-LABEL: atomic_load_monotonic_i16:
123+
; GFX1250: ; %bb.0:
124+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
125+
; GFX1250-NEXT: s_wait_kmcnt 0x0
126+
; GFX1250-NEXT: ds_load_u16 v0, v0
127+
; GFX1250-NEXT: s_wait_dscnt 0x0
128+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
103129
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
104130
ret i16 %load
105131
}
@@ -133,6 +159,14 @@ define i16 @atomic_load_monotonic_i16_offset(ptr addrspace(3) %ptr) {
133159
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
134160
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
135161
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
162+
;
163+
; GFX1250-LABEL: atomic_load_monotonic_i16_offset:
164+
; GFX1250: ; %bb.0:
165+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
166+
; GFX1250-NEXT: s_wait_kmcnt 0x0
167+
; GFX1250-NEXT: ds_load_u16 v0, v0 offset:32
168+
; GFX1250-NEXT: s_wait_dscnt 0x0
169+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
136170
%gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16
137171
%load = load atomic i16, ptr addrspace(3) %gep monotonic, align 2
138172
ret i16 %load
@@ -160,6 +194,14 @@ define i32 @atomic_load_monotonic_i32(ptr addrspace(3) %ptr) {
160194
; GFX11-NEXT: ds_load_b32 v0, v0
161195
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
162196
; GFX11-NEXT: s_setpc_b64 s[30:31]
197+
;
198+
; GFX1250-LABEL: atomic_load_monotonic_i32:
199+
; GFX1250: ; %bb.0:
200+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
201+
; GFX1250-NEXT: s_wait_kmcnt 0x0
202+
; GFX1250-NEXT: ds_load_b32 v0, v0
203+
; GFX1250-NEXT: s_wait_dscnt 0x0
204+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
163205
%load = load atomic i32, ptr addrspace(3) %ptr monotonic, align 4
164206
ret i32 %load
165207
}
@@ -186,6 +228,14 @@ define i32 @atomic_load_monotonic_i32_offset(ptr addrspace(3) %ptr) {
186228
; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
187229
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
188230
; GFX11-NEXT: s_setpc_b64 s[30:31]
231+
;
232+
; GFX1250-LABEL: atomic_load_monotonic_i32_offset:
233+
; GFX1250: ; %bb.0:
234+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
235+
; GFX1250-NEXT: s_wait_kmcnt 0x0
236+
; GFX1250-NEXT: ds_load_b32 v0, v0 offset:64
237+
; GFX1250-NEXT: s_wait_dscnt 0x0
238+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
189239
%gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16
190240
%load = load atomic i32, ptr addrspace(3) %gep monotonic, align 4
191241
ret i32 %load
@@ -213,6 +263,14 @@ define i64 @atomic_load_monotonic_i64(ptr addrspace(3) %ptr) {
213263
; GFX11-NEXT: ds_load_b64 v[0:1], v0
214264
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
215265
; GFX11-NEXT: s_setpc_b64 s[30:31]
266+
;
267+
; GFX1250-LABEL: atomic_load_monotonic_i64:
268+
; GFX1250: ; %bb.0:
269+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
270+
; GFX1250-NEXT: s_wait_kmcnt 0x0
271+
; GFX1250-NEXT: ds_load_b64 v[0:1], v0
272+
; GFX1250-NEXT: s_wait_dscnt 0x0
273+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
216274
%load = load atomic i64, ptr addrspace(3) %ptr monotonic, align 8
217275
ret i64 %load
218276
}
@@ -239,6 +297,14 @@ define i64 @atomic_load_monotonic_i64_offset(ptr addrspace(3) %ptr) {
239297
; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
240298
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
241299
; GFX11-NEXT: s_setpc_b64 s[30:31]
300+
;
301+
; GFX1250-LABEL: atomic_load_monotonic_i64_offset:
302+
; GFX1250: ; %bb.0:
303+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
304+
; GFX1250-NEXT: s_wait_kmcnt 0x0
305+
; GFX1250-NEXT: ds_load_b64 v[0:1], v0 offset:128
306+
; GFX1250-NEXT: s_wait_dscnt 0x0
307+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
242308
%gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 16
243309
%load = load atomic i64, ptr addrspace(3) %gep monotonic, align 8
244310
ret i64 %load
@@ -266,6 +332,14 @@ define float @atomic_load_monotonic_f32_offset(ptr addrspace(3) %ptr) {
266332
; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
267333
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
268334
; GFX11-NEXT: s_setpc_b64 s[30:31]
335+
;
336+
; GFX1250-LABEL: atomic_load_monotonic_f32_offset:
337+
; GFX1250: ; %bb.0:
338+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
339+
; GFX1250-NEXT: s_wait_kmcnt 0x0
340+
; GFX1250-NEXT: ds_load_b32 v0, v0 offset:64
341+
; GFX1250-NEXT: s_wait_dscnt 0x0
342+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
269343
%gep = getelementptr inbounds float, ptr addrspace(3) %ptr, i32 16
270344
%load = load atomic float, ptr addrspace(3) %gep monotonic, align 4
271345
ret float %load
@@ -293,6 +367,14 @@ define double @atomic_load_monotonic_f64_offset(ptr addrspace(3) %ptr) {
293367
; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
294368
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
295369
; GFX11-NEXT: s_setpc_b64 s[30:31]
370+
;
371+
; GFX1250-LABEL: atomic_load_monotonic_f64_offset:
372+
; GFX1250: ; %bb.0:
373+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
374+
; GFX1250-NEXT: s_wait_kmcnt 0x0
375+
; GFX1250-NEXT: ds_load_b64 v[0:1], v0 offset:128
376+
; GFX1250-NEXT: s_wait_dscnt 0x0
377+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
296378
%gep = getelementptr inbounds double, ptr addrspace(3) %ptr, i32 16
297379
%load = load atomic double, ptr addrspace(3) %gep monotonic, align 8
298380
ret double %load
@@ -320,6 +402,14 @@ define ptr @atomic_load_monotonic_p0i8_offset(ptr addrspace(3) %ptr) {
320402
; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
321403
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
322404
; GFX11-NEXT: s_setpc_b64 s[30:31]
405+
;
406+
; GFX1250-LABEL: atomic_load_monotonic_p0i8_offset:
407+
; GFX1250: ; %bb.0:
408+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
409+
; GFX1250-NEXT: s_wait_kmcnt 0x0
410+
; GFX1250-NEXT: ds_load_b64 v[0:1], v0 offset:128
411+
; GFX1250-NEXT: s_wait_dscnt 0x0
412+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
323413
%gep = getelementptr inbounds ptr, ptr addrspace(3) %ptr, i32 16
324414
%load = load atomic ptr, ptr addrspace(3) %gep monotonic, align 8
325415
ret ptr %load
@@ -347,6 +437,14 @@ define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr
347437
; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
348438
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
349439
; GFX11-NEXT: s_setpc_b64 s[30:31]
440+
;
441+
; GFX1250-LABEL: atomic_load_monotonic_p3i8_offset:
442+
; GFX1250: ; %bb.0:
443+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
444+
; GFX1250-NEXT: s_wait_kmcnt 0x0
445+
; GFX1250-NEXT: ds_load_b32 v0, v0 offset:64
446+
; GFX1250-NEXT: s_wait_dscnt 0x0
447+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
350448
%gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %ptr, i32 16
351449
%load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4
352450
ret ptr addrspace(3) %load
@@ -381,6 +479,14 @@ define i16 @atomic_load_monotonic_f16(ptr addrspace(3) %ptr) {
381479
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
382480
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
383481
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
482+
;
483+
; GFX1250-LABEL: atomic_load_monotonic_f16:
484+
; GFX1250: ; %bb.0:
485+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
486+
; GFX1250-NEXT: s_wait_kmcnt 0x0
487+
; GFX1250-NEXT: ds_load_u16 v0, v0
488+
; GFX1250-NEXT: s_wait_dscnt 0x0
489+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
384490
%load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2
385491
%ret = bitcast half %load to i16
386492
ret i16 %ret
@@ -415,6 +521,14 @@ define i16 @atomic_load_monotonic_f16_offset(ptr addrspace(3) %ptr) {
415521
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
416522
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
417523
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
524+
;
525+
; GFX1250-LABEL: atomic_load_monotonic_f16_offset:
526+
; GFX1250: ; %bb.0:
527+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
528+
; GFX1250-NEXT: s_wait_kmcnt 0x0
529+
; GFX1250-NEXT: ds_load_u16 v0, v0 offset:32
530+
; GFX1250-NEXT: s_wait_dscnt 0x0
531+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
418532
%gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
419533
%load = load atomic half, ptr addrspace(3) %gep monotonic, align 2
420534
%ret = bitcast half %load to i16
@@ -450,6 +564,14 @@ define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) {
450564
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
451565
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
452566
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
567+
;
568+
; GFX1250-LABEL: atomic_load_monotonic_bf16:
569+
; GFX1250: ; %bb.0:
570+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
571+
; GFX1250-NEXT: s_wait_kmcnt 0x0
572+
; GFX1250-NEXT: ds_load_u16 v0, v0
573+
; GFX1250-NEXT: s_wait_dscnt 0x0
574+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
453575
%load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
454576
%ret = bitcast bfloat %load to i16
455577
ret i16 %ret
@@ -484,10 +606,20 @@ define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) {
484606
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
485607
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
486608
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
609+
;
610+
; GFX1250-LABEL: atomic_load_monotonic_bf16_offset:
611+
; GFX1250: ; %bb.0:
612+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
613+
; GFX1250-NEXT: s_wait_kmcnt 0x0
614+
; GFX1250-NEXT: ds_load_u16 v0, v0 offset:32
615+
; GFX1250-NEXT: s_wait_dscnt 0x0
616+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
487617
%gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
488618
%load = load atomic bfloat, ptr addrspace(3) %gep monotonic, align 2
489619
%ret = bitcast bfloat %load to i16
490620
ret i16 %ret
491621
}
492622
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
493623
; GCN: {{.*}}
624+
; GFX1250-FAKE16: {{.*}}
625+
; GFX1250-TRUE16: {{.*}}

0 commit comments

Comments
 (0)