@@ -6741,6 +6741,81 @@ entry:
67416741 ret void
67426742}
67436743
; Sequentially consistent atomic store of a bfloat to element 8 of %out,
; i.e. 8 * 2 bytes = 16 bytes past the base pointer. The store has to go
; through %gep rather than %out; otherwise this kernel is identical to
; atomic_store_bf16 and the offset path is never exercised.
; NOTE(review): the expected code below was adjusted by hand to mirror the
; offset handling shown by the atomic_load_*_offset tests in this file (scalar
; add of 16 on the two older targets, folded "offset:16" on the newest one).
; Regenerate with utils/update_llc_test_checks.py to confirm the exact
; register assignment before committing.
define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16_offset:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN1-NEXT:    s_load_dword s4, s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s2, 16
; GCN1-NEXT:    s_addc_u32 s1, s3, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s4
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_bf16_offset:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GCN2-NEXT:    s_load_dword s4, s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s2, 16
; GCN2-NEXT:    s_addc_u32 s1, s3, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s4
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_bf16_offset:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
  %gep = getelementptr bfloat, ptr %out, i64 8
  store atomic bfloat %in, ptr %gep seq_cst, align 2
  ret void
}
6781+
; Sequentially consistent atomic store of the bfloat kernel argument %in
; directly to the flat pointer %out (no address offset). Every target is
; expected to emit a single 16-bit flat store of the loaded argument.
define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN1-NEXT:    s_load_dword s0, s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_bf16:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GCN2-NEXT:    s_load_dword s0, s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_bf16:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  store atomic bfloat %in, ptr %out seq_cst, align 2
  ret void
}
6818+
67446819define amdgpu_kernel void @atomic_inc_i32_offset (ptr %out , i32 %in ) {
67456820; GCN1-LABEL: atomic_inc_i32_offset:
67466821; GCN1: ; %bb.0: ; %entry
@@ -7868,3 +7943,201 @@ entry:
78687943 store i32 %val , ptr %out2
78697944 ret void
78707945}
7946+
; Sequentially consistent atomic load of a half from element 8 of %in
; (8 * 2 bytes = 16 bytes past the base), followed by a plain store of the
; loaded value to %out. The two older targets are expected to add the 16-byte
; offset with scalar arithmetic; the newest one folds it into the flat load
; as "offset:16".
define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f16_offset:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_f16_offset:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_f16_offset:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  %src = getelementptr half, ptr %in, i64 8
  %loaded = load atomic half, ptr %src seq_cst, align 2
  store half %loaded, ptr %out
  ret void
}
7998+
; Sequentially consistent atomic load of a half directly from %in (no address
; offset), followed by a plain store of the loaded value to %out. All three
; targets are expected to produce the same instruction sequence apart from
; the kernel-argument load offset.
define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f16:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_f16:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_f16:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  %loaded = load atomic half, ptr %in seq_cst, align 2
  store half %loaded, ptr %out
  ret void
}
8045+
; Sequentially consistent atomic load of a bfloat from element 8 of %in
; (8 * 2 bytes = 16 bytes past the base), followed by a plain store of the
; loaded value to %out. The two older targets are expected to add the 16-byte
; offset with scalar arithmetic; the newest one folds it into the flat load
; as "offset:16".
define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_bf16_offset:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_bf16_offset:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_bf16_offset:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  %src = getelementptr bfloat, ptr %in, i64 8
  %loaded = load atomic bfloat, ptr %src seq_cst, align 2
  store bfloat %loaded, ptr %out
  ret void
}
8097+
; Sequentially consistent atomic load of a bfloat directly from %in (no
; address offset), followed by a plain store of the loaded value to %out.
; All three targets are expected to produce the same instruction sequence
; apart from the kernel-argument load offset.
define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_bf16:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_bf16:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_bf16:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  %loaded = load atomic bfloat, ptr %in seq_cst, align 2
  store bfloat %loaded, ptr %out
  ret void
}
0 commit comments