Skip to content

Commit 4a7ebb1

Browse files
committed
[LoadStoreVectorizer] Fix tests
1 parent 51a0c30 commit 4a7ebb1

9 files changed

+169
-201
lines changed

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3850,8 +3850,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
38503850
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
38513851
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) poison`, addrspace 4)
38523852
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
3853-
; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p3) from `ptr addrspace(1) poison`, addrspace 1)
3854-
; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p5) from `ptr addrspace(1) poison`, addrspace 1)
3853+
; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `ptr addrspace(1) poison`, addrspace 1)
3854+
; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[LOAD2]](s32)
3855+
; CHECK-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p5) = G_INTTOPTR [[LOAD2]](s32)
38553856
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
38563857
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5
38573858
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3880,10 +3881,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
38803881
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
38813882
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
38823883
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
3883-
; CHECK-NEXT: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
3884+
; CHECK-NEXT: G_STORE [[INTTOPTR]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
38843885
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
38853886
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
3886-
; CHECK-NEXT: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
3887+
; CHECK-NEXT: G_STORE [[INTTOPTR1]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
38873888
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
38883889
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
38893890
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)

llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll

Lines changed: 25 additions & 26 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
88
; GCN-NEXT: {{ $}}
99
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
1010
; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
11-
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
11+
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset.align.down, addrspace 4)
1212
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
1313
; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
1414
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440

llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -6,28 +6,26 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
66
; CHECK: ; %bb.0: ; %bb
77
; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3]
88
; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1]
9-
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
109
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
11-
; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4
1210
; CHECK-NEXT: s_add_u32 s24, s24, s17
1311
; CHECK-NEXT: s_addc_u32 s25, s25, 0
1412
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
15-
; CHECK-NEXT: s_bitcmp1_b32 s2, 0
16-
; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0
17-
; CHECK-NEXT: s_bitcmp1_b32 s2, 8
13+
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
14+
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
15+
; CHECK-NEXT: s_bitcmp1_b32 s0, 8
1816
; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0
19-
; CHECK-NEXT: s_bitcmp1_b32 s2, 16
17+
; CHECK-NEXT: s_bitcmp1_b32 s0, 16
18+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
2019
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
2120
; CHECK-NEXT: s_bitcmp1_b32 s0, 24
2221
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
2322
; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1
2423
; CHECK-NEXT: s_bitcmp1_b32 s1, 0
25-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
24+
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
2625
; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0
27-
; CHECK-NEXT: s_bitcmp1_b32 s6, 8
28-
; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0
29-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17]
26+
; CHECK-NEXT: s_bitcmp1_b32 s1, 8
3027
; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0
28+
; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v1
3129
; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5]
3230
; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11]
3331
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0

llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll

Lines changed: 63 additions & 85 deletions
Original file line number | Diff line number | Diff line change
@@ -862,160 +862,138 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
862862
define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
863863
; VI-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16:
864864
; VI-DENORM: ; %bb.0:
865-
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
866-
; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
867-
; VI-DENORM-NEXT: s_load_dword s6, s[8:9], 0x8
865+
; VI-DENORM-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
868866
; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
869867
; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
868+
; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
870869
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
871-
; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16
870+
; VI-DENORM-NEXT: s_lshr_b32 s5, s2, 16
871+
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
872+
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
873+
; VI-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, v0
874+
; VI-DENORM-NEXT: v_fma_f16 v3, |s2|, 2.0, v1
872875
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
873-
; VI-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, v0
874-
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1
875-
; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0
876-
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
877-
; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
878-
; VI-DENORM-NEXT: s_add_u32 s4, s2, 2
879-
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
880-
; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0
881-
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
876+
; VI-DENORM-NEXT: s_add_u32 s4, s0, 2
877+
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
878+
; VI-DENORM-NEXT: s_addc_u32 s5, s1, 0
879+
; VI-DENORM-NEXT: flat_store_short v[0:1], v3
882880
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
883881
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
884882
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
885-
; VI-DENORM-NEXT: flat_store_short v[0:1], v3
883+
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
886884
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
887885
; VI-DENORM-NEXT: s_endpgm
888886
;
889887
; VI-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16:
890888
; VI-FLUSH: ; %bb.0:
891-
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
892-
; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
893-
; VI-FLUSH-NEXT: s_load_dword s6, s[8:9], 0x8
889+
; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
894890
; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
895891
; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
892+
; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
896893
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
897-
; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
894+
; VI-FLUSH-NEXT: s_lshr_b32 s5, s2, 16
895+
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3
896+
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
897+
; VI-FLUSH-NEXT: v_mad_f16 v2, |s2|, 2.0, v0
898+
; VI-FLUSH-NEXT: v_mad_f16 v3, |s2|, 2.0, v1
898899
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
899-
; VI-FLUSH-NEXT: v_mad_f16 v2, |s6|, 2.0, v0
900-
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1
901-
; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0
902-
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
903-
; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
904-
; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2
905-
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
906-
; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0
907-
; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
900+
; VI-FLUSH-NEXT: s_add_u32 s4, s0, 2
901+
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
902+
; VI-FLUSH-NEXT: s_addc_u32 s5, s1, 0
903+
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
908904
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
909905
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
910906
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
911-
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
907+
; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
912908
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
913909
; VI-FLUSH-NEXT: s_endpgm
914910
;
915911
; GFX10-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16:
916912
; GFX10-DENORM: ; %bb.0:
917-
; GFX10-DENORM-NEXT: s_clause 0x2
918-
; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
919-
; GFX10-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
920-
; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
913+
; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
921914
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
922915
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
923-
; GFX10-DENORM-NEXT: s_lshr_b32 s0, s0, 16
924-
; GFX10-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1
925-
; GFX10-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0
926-
; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3]
916+
; GFX10-DENORM-NEXT: s_lshr_b32 s4, s2, 16
917+
; GFX10-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3
918+
; GFX10-DENORM-NEXT: v_fma_f16 v1, |s2|, 2.0, s4
919+
; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1]
927920
; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
928-
; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3] offset:2
921+
; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] offset:2
929922
; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
930923
; GFX10-DENORM-NEXT: s_endpgm
931924
;
932925
; GFX10-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16:
933926
; GFX10-FLUSH: ; %bb.0:
934-
; GFX10-FLUSH-NEXT: s_clause 0x2
935-
; GFX10-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
936-
; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
937-
; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
927+
; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
938928
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0
939929
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
940-
; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4|
941-
; GFX10-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
942-
; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0
943-
; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0
944-
; GFX10-FLUSH-NEXT: global_store_short v1, v2, s[2:3]
930+
; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2|
931+
; GFX10-FLUSH-NEXT: s_lshr_b32 s2, s2, 16
932+
; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0
933+
; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s3, v0
934+
; GFX10-FLUSH-NEXT: global_store_short v1, v2, s[0:1]
945935
; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
946-
; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[2:3] offset:2
936+
; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[0:1] offset:2
947937
; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
948938
; GFX10-FLUSH-NEXT: s_endpgm
949939
;
950940
; GFX11-DENORM-TRUE16-LABEL: multiple_use_fadd_multi_fmad_f16:
951941
; GFX11-DENORM-TRUE16: ; %bb.0:
952-
; GFX11-DENORM-TRUE16-NEXT: s_clause 0x2
953-
; GFX11-DENORM-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
954-
; GFX11-DENORM-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x8
955-
; GFX11-DENORM-TRUE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
942+
; GFX11-DENORM-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
956943
; GFX11-DENORM-TRUE16-NEXT: v_mov_b32_e32 v1, 0
957944
; GFX11-DENORM-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
958-
; GFX11-DENORM-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
959-
; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.h, |s6|, 2.0, s1
960-
; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.l, |s6|, 2.0, s0
961-
; GFX11-DENORM-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3] dlc
945+
; GFX11-DENORM-TRUE16-NEXT: s_lshr_b32 s4, s2, 16
946+
; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.h, |s2|, 2.0, s3
947+
; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.l, |s2|, 2.0, s4
948+
; GFX11-DENORM-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc
962949
; GFX11-DENORM-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
963-
; GFX11-DENORM-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[2:3] offset:2 dlc
950+
; GFX11-DENORM-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] offset:2 dlc
964951
; GFX11-DENORM-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
965952
; GFX11-DENORM-TRUE16-NEXT: s_endpgm
966953
;
967954
; GFX11-DENORM-FAKE16-LABEL: multiple_use_fadd_multi_fmad_f16:
968955
; GFX11-DENORM-FAKE16: ; %bb.0:
969-
; GFX11-DENORM-FAKE16-NEXT: s_clause 0x2
970-
; GFX11-DENORM-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
971-
; GFX11-DENORM-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x8
972-
; GFX11-DENORM-FAKE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
956+
; GFX11-DENORM-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
973957
; GFX11-DENORM-FAKE16-NEXT: v_mov_b32_e32 v0, 0
974958
; GFX11-DENORM-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
975-
; GFX11-DENORM-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
976-
; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v2, |s6|, 2.0, s1
977-
; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v1, |s6|, 2.0, s0
978-
; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v1, s[2:3] dlc
959+
; GFX11-DENORM-FAKE16-NEXT: s_lshr_b32 s4, s2, 16
960+
; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v2, |s2|, 2.0, s3
961+
; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v1, |s2|, 2.0, s4
962+
; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc
979963
; GFX11-DENORM-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
980-
; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v2, s[2:3] offset:2 dlc
964+
; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc
981965
; GFX11-DENORM-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
982966
; GFX11-DENORM-FAKE16-NEXT: s_endpgm
983967
;
984968
; GFX11-FLUSH-TRUE16-LABEL: multiple_use_fadd_multi_fmad_f16:
985969
; GFX11-FLUSH-TRUE16: ; %bb.0:
986-
; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x2
987-
; GFX11-FLUSH-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x8
988-
; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
989-
; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
970+
; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
990971
; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v1, 0
991972
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
992-
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e64 v0.l, |s6|, |s6|
993-
; GFX11-FLUSH-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
973+
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e64 v0.l, |s2|, |s2|
974+
; GFX11-FLUSH-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
994975
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
995-
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.h, s0, v0.l
996-
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, s1, v0.l
997-
; GFX11-FLUSH-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[2:3] dlc
976+
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.h, s2, v0.l
977+
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, s3, v0.l
978+
; GFX11-FLUSH-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] dlc
998979
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
999-
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3] offset:2 dlc
980+
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc
1000981
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
1001982
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
1002983
;
1003984
; GFX11-FLUSH-FAKE16-LABEL: multiple_use_fadd_multi_fmad_f16:
1004985
; GFX11-FLUSH-FAKE16: ; %bb.0:
1005-
; GFX11-FLUSH-FAKE16-NEXT: s_clause 0x2
1006-
; GFX11-FLUSH-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x8
1007-
; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
1008-
; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
986+
; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1009987
; GFX11-FLUSH-FAKE16-NEXT: v_mov_b32_e32 v1, 0
1010988
; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1011-
; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e64 v0, |s6|, |s6|
1012-
; GFX11-FLUSH-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
989+
; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e64 v0, |s2|, |s2|
990+
; GFX11-FLUSH-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
1013991
; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1014-
; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v2, s0, v0
1015-
; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v0, s1, v0
1016-
; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v2, s[2:3] dlc
992+
; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v2, s2, v0
993+
; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v0, s3, v0
994+
; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v2, s[0:1] dlc
1017995
; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
1018-
; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v0, s[2:3] offset:2 dlc
996+
; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc
1019997
; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
1020998
; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
1021999
%x = bitcast i16 %x.arg to half

llvm/test/CodeGen/AMDGPU/mad_uint24.ll

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -133,11 +133,10 @@ define amdgpu_kernel void @i16_mad24(ptr addrspace(1) %out, i16 %a, i16 %b, i16
133133
; GCN-LABEL: i16_mad24:
134134
; GCN: ; %bb.0: ; %entry
135135
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
136-
; GCN-NEXT: s_load_dword s4, s[4:5], 0xb
137136
; GCN-NEXT: s_mov_b32 s7, 0xf000
138137
; GCN-NEXT: s_waitcnt lgkmcnt(0)
139-
; GCN-NEXT: s_lshr_b32 s2, s2, 16
140-
; GCN-NEXT: s_mul_i32 s2, s4, s2
138+
; GCN-NEXT: s_lshr_b32 s4, s2, 16
139+
; GCN-NEXT: s_mul_i32 s2, s2, s4
141140
; GCN-NEXT: s_add_i32 s2, s2, s3
142141
; GCN-NEXT: s_sext_i32_i16 s2, s2
143142
; GCN-NEXT: s_mov_b32 s6, -1
@@ -150,13 +149,12 @@ define amdgpu_kernel void @i16_mad24(ptr addrspace(1) %out, i16 %a, i16 %b, i16
150149
; GFX8-LABEL: i16_mad24:
151150
; GFX8: ; %bb.0: ; %entry
152151
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
153-
; GFX8-NEXT: s_load_dword s8, s[4:5], 0x2c
154152
; GFX8-NEXT: s_mov_b32 s7, 0xf000
155153
; GFX8-NEXT: s_mov_b32 s6, -1
156154
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
157155
; GFX8-NEXT: s_mov_b32 s4, s0
158156
; GFX8-NEXT: s_lshr_b32 s0, s2, 16
159-
; GFX8-NEXT: s_mul_i32 s0, s8, s0
157+
; GFX8-NEXT: s_mul_i32 s0, s2, s0
160158
; GFX8-NEXT: s_add_i32 s0, s0, s3
161159
; GFX8-NEXT: s_sext_i32_i16 s0, s0
162160
; GFX8-NEXT: s_mov_b32 s5, s1

llvm/test/CodeGen/AMDGPU/sad.ll

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -388,20 +388,18 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32
388388
define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) {
389389
; GCN-LABEL: v_sad_u32_i16_pat1:
390390
; GCN: ; %bb.0:
391-
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
392-
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
393-
; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
391+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
394392
; GCN-NEXT: s_add_i32 s12, s12, s17
395393
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
394+
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
396395
; GCN-NEXT: s_waitcnt lgkmcnt(0)
397-
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
398-
; GCN-NEXT: s_lshr_b32 s0, s0, 16
399-
; GCN-NEXT: v_mov_b32_e32 v0, s1
400-
; GCN-NEXT: v_mov_b32_e32 v1, s0
396+
; GCN-NEXT: s_and_b32 s4, s2, 0xffff
397+
; GCN-NEXT: s_lshr_b32 s2, s2, 16
398+
; GCN-NEXT: v_mov_b32_e32 v0, s3
399+
; GCN-NEXT: v_mov_b32_e32 v1, s2
401400
; GCN-NEXT: v_sad_u32 v2, s4, v1, v0
402-
; GCN-NEXT: v_mov_b32_e32 v0, s2
403-
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
404-
; GCN-NEXT: v_mov_b32_e32 v1, s3
401+
; GCN-NEXT: v_mov_b32_e32 v0, s0
402+
; GCN-NEXT: v_mov_b32_e32 v1, s1
405403
; GCN-NEXT: flat_store_short v[0:1], v2
406404
; GCN-NEXT: s_endpgm
407405
%icmp0 = icmp ugt i16 %a, %b

0 commit comments

Comments
 (0)