Skip to content

Commit 9d55b17

Browse files
committed
[LSV] Check for power-of-two width
1 parent 869a0e4 commit 9d55b17

28 files changed

+3123
-2722
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -659,11 +659,20 @@ bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
659659
if (!AllElemsMatchTotalBits(C1) || !AllElemsMatchTotalBits(C2))
660660
continue;
661661

662-
// Rebase C2's offsets into C1's coordinate space prior to merging.
663-
rebaseChain(C2, *DeltaOpt);
662+
// Power-of-two span ensures we can form a legal, single vector access
663+
// without padding or splitting. Many targets and cost models assume POT
664+
// widths, and it guarantees an integral element count for the chosen
665+
// VecElemTy.
666+
APInt Sz = C2.front().OffsetFromLeader +
667+
DL.getTypeStoreSize(getLoadStoreType(C2.front().Inst)) -
668+
C1.back().OffsetFromLeader + *DeltaOpt;
669+
if (!Sz.isPowerOf2())
670+
continue;
664671

665-
// Merge C2 into C1 by appending all elements of C2 to C1, then erase C2
672+
// Rebase C2's offsets into C1's coordinate space prior to merging and
673+
// merge C2 into C1 by appending all elements of C2 to C1, then erase C2
666674
// from ContiguousSubChains.
675+
rebaseChain(C2, *DeltaOpt);
667676
C1.insert(C1.end(), C2.begin(), C2.end());
668677
ContiguousSubChains.erase(ContiguousSubChains.begin() + I);
669678

llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -298,24 +298,26 @@ define amdgpu_kernel void @local_address_store(ptr addrspace(3) %out, i32 %val)
298298
define amdgpu_kernel void @local_address_gep_store(ptr addrspace(3) %out, i32, i32 %val, i32 %offset) {
299299
; GFX7-LABEL: local_address_gep_store:
300300
; GFX7: ; %bb.0:
301-
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
301+
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
302+
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9
302303
; GFX7-NEXT: s_mov_b32 m0, -1
303304
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
304-
; GFX7-NEXT: s_lshl_b32 s2, s2, 2
305-
; GFX7-NEXT: s_add_i32 s0, s0, s2
306-
; GFX7-NEXT: v_mov_b32_e32 v0, s1
305+
; GFX7-NEXT: s_lshl_b32 s1, s1, 2
306+
; GFX7-NEXT: v_mov_b32_e32 v0, s0
307+
; GFX7-NEXT: s_add_i32 s0, s2, s1
307308
; GFX7-NEXT: v_mov_b32_e32 v1, s0
308309
; GFX7-NEXT: ds_write_b32 v1, v0
309310
; GFX7-NEXT: s_endpgm
310311
;
311312
; GFX8-LABEL: local_address_gep_store:
312313
; GFX8: ; %bb.0:
313-
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
314+
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
315+
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24
314316
; GFX8-NEXT: s_mov_b32 m0, -1
315317
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
316-
; GFX8-NEXT: s_lshl_b32 s2, s2, 2
317-
; GFX8-NEXT: s_add_i32 s0, s0, s2
318-
; GFX8-NEXT: v_mov_b32_e32 v0, s1
318+
; GFX8-NEXT: s_lshl_b32 s1, s1, 2
319+
; GFX8-NEXT: v_mov_b32_e32 v0, s0
320+
; GFX8-NEXT: s_add_i32 s0, s2, s1
319321
; GFX8-NEXT: v_mov_b32_e32 v1, s0
320322
; GFX8-NEXT: ds_write_b32 v1, v0
321323
; GFX8-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 294 additions & 222 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll

Lines changed: 49 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -513,13 +513,15 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
513513
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) {
514514
; GFX7-LABEL: test_div_fmas_f32_inline_imm_1:
515515
; GFX7: ; %bb.0:
516-
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
517-
; GFX7-NEXT: s_load_dword s4, s[4:5], 0xd
516+
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
517+
; GFX7-NEXT: s_load_dword s3, s[4:5], 0x4
518+
; GFX7-NEXT: s_load_dword s6, s[4:5], 0xd
519+
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
518520
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
519-
; GFX7-NEXT: v_mov_b32_e32 v1, s3
520-
; GFX7-NEXT: s_and_b32 s3, 1, s4
521521
; GFX7-NEXT: v_mov_b32_e32 v0, s2
522-
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3
522+
; GFX7-NEXT: v_mov_b32_e32 v1, s3
523+
; GFX7-NEXT: s_and_b32 s2, 1, s6
524+
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
523525
; GFX7-NEXT: s_mov_b32 s2, -1
524526
; GFX7-NEXT: s_mov_b32 s3, 0xf000
525527
; GFX7-NEXT: s_nop 1
@@ -529,73 +531,84 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
529531
;
530532
; GFX8-LABEL: test_div_fmas_f32_inline_imm_1:
531533
; GFX8: ; %bb.0:
532-
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
533-
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x34
534+
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x8
535+
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x10
536+
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x34
534537
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
535-
; GFX8-NEXT: v_mov_b32_e32 v0, s2
536-
; GFX8-NEXT: s_and_b32 s2, 1, s4
537-
; GFX8-NEXT: v_mov_b32_e32 v1, s3
538-
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
539-
; GFX8-NEXT: s_nop 3
538+
; GFX8-NEXT: v_mov_b32_e32 v0, s0
539+
; GFX8-NEXT: v_mov_b32_e32 v1, s1
540+
; GFX8-NEXT: s_and_b32 s0, 1, s2
541+
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
542+
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
543+
; GFX8-NEXT: s_nop 2
540544
; GFX8-NEXT: v_div_fmas_f32 v2, v0, 1.0, v1
545+
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
541546
; GFX8-NEXT: v_mov_b32_e32 v0, s0
542547
; GFX8-NEXT: v_mov_b32_e32 v1, s1
543548
; GFX8-NEXT: flat_store_dword v[0:1], v2
544549
; GFX8-NEXT: s_endpgm
545550
;
546551
; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1:
547552
; GFX10_W32: ; %bb.0:
548-
; GFX10_W32-NEXT: s_clause 0x1
549-
; GFX10_W32-NEXT: s_load_dword s6, s[4:5], 0x34
550-
; GFX10_W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
553+
; GFX10_W32-NEXT: s_clause 0x3
554+
; GFX10_W32-NEXT: s_load_dword s2, s[4:5], 0x34
555+
; GFX10_W32-NEXT: s_load_dword s3, s[4:5], 0x10
556+
; GFX10_W32-NEXT: s_load_dword s6, s[4:5], 0x8
557+
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
551558
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
552559
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
553-
; GFX10_W32-NEXT: s_and_b32 s4, 1, s6
560+
; GFX10_W32-NEXT: s_and_b32 s2, 1, s2
554561
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3
555-
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
556-
; GFX10_W32-NEXT: v_div_fmas_f32 v0, s2, 1.0, v0
562+
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
563+
; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0
557564
; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1]
558565
; GFX10_W32-NEXT: s_endpgm
559566
;
560567
; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
561568
; GFX10_W64: ; %bb.0:
562-
; GFX10_W64-NEXT: s_clause 0x1
563-
; GFX10_W64-NEXT: s_load_dword s6, s[4:5], 0x34
564-
; GFX10_W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
569+
; GFX10_W64-NEXT: s_clause 0x3
570+
; GFX10_W64-NEXT: s_load_dword s2, s[4:5], 0x34
571+
; GFX10_W64-NEXT: s_load_dword s3, s[4:5], 0x10
572+
; GFX10_W64-NEXT: s_load_dword s6, s[4:5], 0x8
573+
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
565574
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
566575
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
567-
; GFX10_W64-NEXT: s_and_b32 s4, 1, s6
576+
; GFX10_W64-NEXT: s_and_b32 s2, 1, s2
568577
; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3
569-
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
570-
; GFX10_W64-NEXT: v_div_fmas_f32 v0, s2, 1.0, v0
578+
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
579+
; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0
571580
; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1]
572581
; GFX10_W64-NEXT: s_endpgm
573582
;
574583
; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_1:
575584
; GFX11_W32: ; %bb.0:
576-
; GFX11_W32-NEXT: s_clause 0x1
577-
; GFX11_W32-NEXT: s_load_b32 s6, s[4:5], 0x34
578-
; GFX11_W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
585+
; GFX11_W32-NEXT: s_clause 0x3
586+
; GFX11_W32-NEXT: s_load_b32 s2, s[4:5], 0x34
587+
; GFX11_W32-NEXT: s_load_b32 s3, s[4:5], 0x10
588+
; GFX11_W32-NEXT: s_load_b32 s6, s[4:5], 0x8
589+
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
579590
; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0
580591
; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
581-
; GFX11_W32-NEXT: s_and_b32 s4, 1, s6
592+
; GFX11_W32-NEXT: s_and_b32 s2, 1, s2
582593
; GFX11_W32-NEXT: v_mov_b32_e32 v0, s3
583-
; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
584-
; GFX11_W32-NEXT: v_div_fmas_f32 v0, s2, 1.0, v0
594+
; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
595+
; GFX11_W32-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0
585596
; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1]
586597
; GFX11_W32-NEXT: s_endpgm
587598
;
588599
; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_1:
589600
; GFX11_W64: ; %bb.0:
590-
; GFX11_W64-NEXT: s_clause 0x1
591-
; GFX11_W64-NEXT: s_load_b32 s6, s[4:5], 0x34
592-
; GFX11_W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
601+
; GFX11_W64-NEXT: s_clause 0x3
602+
; GFX11_W64-NEXT: s_load_b32 s2, s[4:5], 0x34
603+
; GFX11_W64-NEXT: s_load_b32 s3, s[4:5], 0x10
604+
; GFX11_W64-NEXT: s_load_b32 s6, s[4:5], 0x8
605+
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
593606
; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0
594607
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
595-
; GFX11_W64-NEXT: s_and_b32 s4, 1, s6
608+
; GFX11_W64-NEXT: s_and_b32 s2, 1, s2
596609
; GFX11_W64-NEXT: v_mov_b32_e32 v0, s3
597-
; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
598-
; GFX11_W64-NEXT: v_div_fmas_f32 v0, s2, 1.0, v0
610+
; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
611+
; GFX11_W64-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0
599612
; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1]
600613
; GFX11_W64-NEXT: s_endpgm
601614
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d)

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -684,14 +684,15 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
684684
;
685685
; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
686686
; GFX10: ; %bb.0:
687+
; GFX10-NEXT: s_clause 0x1
687688
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
689+
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
688690
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
689-
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x54
690691
; GFX10-NEXT: v_mov_b32_e32 v2, 0
691692
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
692693
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
693694
; GFX10-NEXT: s_waitcnt vmcnt(0)
694-
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[0:1], s[4:5]
695+
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[0:1], s[6:7]
695696
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
696697
; GFX10-NEXT: s_endpgm
697698
;
@@ -759,14 +760,15 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
759760
;
760761
; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
761762
; GFX10: ; %bb.0:
763+
; GFX10-NEXT: s_clause 0x1
762764
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
765+
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
763766
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
764-
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x54
765767
; GFX10-NEXT: v_mov_b32_e32 v2, 0
766768
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
767769
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
768770
; GFX10-NEXT: s_waitcnt vmcnt(0)
769-
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], v[0:1], s[4:5]
771+
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[6:7], v[0:1], s[6:7]
770772
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
771773
; GFX10-NEXT: s_endpgm
772774
;
@@ -834,14 +836,15 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
834836
;
835837
; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
836838
; GFX10: ; %bb.0:
839+
; GFX10-NEXT: s_clause 0x1
837840
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
841+
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
838842
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
839-
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x54
840843
; GFX10-NEXT: v_mov_b32_e32 v2, 0
841844
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
842845
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
843846
; GFX10-NEXT: s_waitcnt vmcnt(0)
844-
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], v[0:1]
847+
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[6:7], s[6:7], v[0:1]
845848
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
846849
; GFX10-NEXT: s_endpgm
847850
;
@@ -909,14 +912,15 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
909912
;
910913
; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
911914
; GFX10: ; %bb.0:
915+
; GFX10-NEXT: s_clause 0x1
912916
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
917+
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
913918
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
914-
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x54
915919
; GFX10-NEXT: v_mov_b32_e32 v2, 0
916920
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
917921
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
918922
; GFX10-NEXT: s_waitcnt vmcnt(0)
919-
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], s[4:5], v[0:1]
923+
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], s[6:7], v[0:1]
920924
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
921925
; GFX10-NEXT: s_endpgm
922926
;

llvm/test/CodeGen/AMDGPU/add_i64.ll

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -142,11 +142,12 @@ define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr ad
142142
define amdgpu_kernel void @trunc_i64_add_to_i32(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b) {
143143
; SI-LABEL: trunc_i64_add_to_i32:
144144
; SI: ; %bb.0:
145-
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
146-
; SI-NEXT: s_load_dword s4, s[4:5], 0x11
147-
; SI-NEXT: s_waitcnt lgkmcnt(0)
145+
; SI-NEXT: s_load_dword s2, s[4:5], 0xd
146+
; SI-NEXT: s_load_dword s6, s[4:5], 0x11
147+
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
148148
; SI-NEXT: s_mov_b32 s3, 0xf000
149-
; SI-NEXT: s_add_i32 s4, s4, s2
149+
; SI-NEXT: s_waitcnt lgkmcnt(0)
150+
; SI-NEXT: s_add_i32 s4, s6, s2
150151
; SI-NEXT: s_mov_b32 s2, -1
151152
; SI-NEXT: v_mov_b32_e32 v0, s4
152153
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0

0 commit comments

Comments
 (0)