Skip to content

Commit b8e09f1

Browse files
committed
[AMDGPU] Refactor LDS alignment checks.
Move feature/bug checks into the single place allowsMisalignedMemoryAccessesImpl. This is mostly NFCI except for the order of selection in a couple of places. A separate change may be needed to stop lying about Fast. Differential Revision: https://reviews.llvm.org/D123343
1 parent 0488c66 commit b8e09f1

File tree

3 files changed

+77
-103
lines changed

3 files changed

+77
-103
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 47 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,25 +1516,23 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
15161516

15171517
if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
15181518
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1519+
// Check if alignment requirements for ds_read/write instructions are
1520+
// disabled.
1521+
if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1522+
return false;
1523+
15191524
Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
15201525
if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
15211526
Alignment < RequiredAlignment)
15221527
return false;
15231528

1524-
// Check if alignment requirements for ds_read/write instructions are
1525-
// disabled.
1526-
if (Subtarget->hasUnalignedDSAccessEnabled()) {
1527-
if (IsFast)
1528-
*IsFast = Alignment != Align(2);
1529-
return true;
1530-
}
1531-
15321529
// Either, the alignment requirements are "enabled", or there is an
15331530
// unaligned LDS access related hardware bug though alignment requirements
15341531
// are "disabled". In either case, we need to check for proper alignment
15351532
// requirements.
15361533
//
1537-
if (Size == 64) {
1534+
switch (Size) {
1535+
case 64:
15381536
// SI has a hardware bug in the LDS / GDS bounds checking: if the base
15391537
// address is negative, then the instruction is incorrectly treated as
15401538
// out-of-bounds even if base + offsets is in bounds. Split vectorized
@@ -1546,31 +1544,41 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
15461544
// 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
15471545
// can do a 4 byte aligned, 8 byte access in a single operation using
15481546
// ds_read2/write2_b32 with adjacent offsets.
1549-
bool AlignedBy4 = Alignment >= Align(4);
1550-
if (IsFast)
1551-
*IsFast = AlignedBy4;
1547+
RequiredAlignment = Align(4);
1548+
break;
1549+
case 96:
1550+
if (!Subtarget->hasDS96AndDS128())
1551+
return false;
15521552

1553-
return AlignedBy4;
1554-
}
1555-
if (Size == 96) {
15561553
// 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
15571554
// gfx8 and older.
1558-
bool AlignedBy16 = Alignment >= Align(16);
1559-
if (IsFast)
1560-
*IsFast = AlignedBy16;
1555+
RequiredAlignment = Align(16);
1556+
break;
1557+
case 128:
1558+
if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1559+
return false;
15611560

1562-
return AlignedBy16;
1563-
}
1564-
if (Size == 128) {
15651561
// 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
15661562
// gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
15671563
// single operation using ds_read2/write2_b64.
1568-
bool AlignedBy8 = Alignment >= Align(8);
1569-
if (IsFast)
1570-
*IsFast = AlignedBy8;
1564+
RequiredAlignment = Align(8);
1565+
break;
1566+
default:
1567+
if (Size > 32)
1568+
return false;
15711569

1572-
return AlignedBy8;
1570+
break;
15731571
}
1572+
1573+
if (IsFast) {
1574+
// FIXME: Lie it is fast if +unaligned-access-mode is passed so that
1575+
// DS accesses get vectorized.
1576+
*IsFast = Alignment >= RequiredAlignment ||
1577+
Subtarget->hasUnalignedDSAccessEnabled();
1578+
}
1579+
1580+
return Alignment >= RequiredAlignment ||
1581+
Subtarget->hasUnalignedDSAccessEnabled();
15741582
}
15751583

15761584
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -1595,9 +1603,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
15951603
return AlignedBy4;
15961604
}
15971605

1598-
if (Subtarget->hasUnalignedBufferAccessEnabled() &&
1599-
!(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1600-
AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
1606+
if (Subtarget->hasUnalignedBufferAccessEnabled()) {
16011607
// If we have a uniform constant load, it still requires using a slow
16021608
// buffer instruction if unaligned.
16031609
if (IsFast) {
@@ -8519,27 +8525,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
85198525
llvm_unreachable("unsupported private_element_size");
85208526
}
85218527
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
8522-
// Use ds_read_b128 or ds_read_b96 when possible.
8523-
if (Subtarget->hasDS96AndDS128() &&
8524-
((Subtarget->useDS128() && MemVT.getStoreSize() == 16) ||
8525-
MemVT.getStoreSize() == 12) &&
8526-
allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
8527-
Load->getAlign()))
8528+
bool Fast = false;
8529+
auto Flags = Load->getMemOperand()->getFlags();
8530+
if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
8531+
Load->getAlign(), Flags, &Fast) &&
8532+
Fast)
85288533
return SDValue();
85298534

8530-
if (NumElements > 2)
8535+
if (MemVT.isVector())
85318536
return SplitVectorLoad(Op, DAG);
8532-
8533-
// SI has a hardware bug in the LDS / GDS bounds checking: if the base
8534-
// address is negative, then the instruction is incorrectly treated as
8535-
// out-of-bounds even if base + offsets is in bounds. Split vectorized
8536-
// loads here to avoid emitting ds_read2_b32. We may re-combine the
8537-
// load later in the SILoadStoreOptimizer.
8538-
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
8539-
NumElements == 2 && MemVT.getStoreSize() == 8 &&
8540-
Load->getAlignment() < 8) {
8541-
return SplitVectorLoad(Op, DAG);
8542-
}
85438537
}
85448538

85458539
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
@@ -9030,36 +9024,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
90309024
llvm_unreachable("unsupported private_element_size");
90319025
}
90329026
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
9033-
// Use ds_write_b128 or ds_write_b96 when possible.
9034-
if (Subtarget->hasDS96AndDS128() &&
9035-
((Subtarget->useDS128() && VT.getStoreSize() == 16) ||
9036-
(VT.getStoreSize() == 12)) &&
9037-
allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
9038-
Store->getAlign()))
9027+
bool Fast = false;
9028+
auto Flags = Store->getMemOperand()->getFlags();
9029+
if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
9030+
Store->getAlign(), Flags, &Fast) &&
9031+
Fast)
90399032
return SDValue();
90409033

9041-
if (NumElements > 2)
9034+
if (VT.isVector())
90429035
return SplitVectorStore(Op, DAG);
90439036

9044-
// SI has a hardware bug in the LDS / GDS bounds checking: if the base
9045-
// address is negative, then the instruction is incorrectly treated as
9046-
// out-of-bounds even if base + offsets is in bounds. Split vectorized
9047-
// stores here to avoid emitting ds_write2_b32. We may re-combine the
9048-
// store later in the SILoadStoreOptimizer.
9049-
if (!Subtarget->hasUsableDSOffset() &&
9050-
NumElements == 2 && VT.getStoreSize() == 8 &&
9051-
Store->getAlignment() < 8) {
9052-
return SplitVectorStore(Op, DAG);
9053-
}
9054-
9055-
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
9056-
VT, *Store->getMemOperand())) {
9057-
if (VT.isVector())
9058-
return SplitVectorStore(Op, DAG);
9059-
return expandUnalignedStore(Store, DAG);
9060-
}
9061-
9062-
return SDValue();
9037+
return expandUnalignedStore(Store, DAG);
90639038
} else {
90649039
llvm_unreachable("unhandled address space");
90659040
}

llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll

Lines changed: 25 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -66,41 +66,40 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg
6666
; CHECK-NEXT: s_mov_b32 s6, s4
6767
; CHECK-NEXT: s_mov_b32 s5, s3
6868
; CHECK-NEXT: s_mov_b32 s4, s2
69-
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v1
69+
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 20, v1
70+
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 16, v1
71+
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 12, v1
7072
; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1
71-
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1
72-
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
73-
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1
73+
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 4, v1
7474
; CHECK-NEXT: v_mov_b32_e32 v9, s0
75-
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v2
76-
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2
77-
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 12, v2
75+
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 20, v2
76+
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 16, v2
7877
; CHECK-NEXT: s_mov_b32 m0, -1
79-
; CHECK-NEXT: ds_read_b32 v3, v1
80-
; CHECK-NEXT: ds_read_b32 v4, v0
78+
; CHECK-NEXT: ds_read_b32 v7, v3
79+
; CHECK-NEXT: ds_read_b32 v6, v4
8180
; CHECK-NEXT: ds_read_b32 v5, v5
82-
; CHECK-NEXT: ds_read_b32 v6, v6
83-
; CHECK-NEXT: ds_read_b32 v0, v7
84-
; CHECK-NEXT: ds_read_b32 v1, v8
85-
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v2
86-
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2
87-
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
88-
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
81+
; CHECK-NEXT: ds_read_b32 v4, v8
82+
; CHECK-NEXT: ds_read_b32 v8, v0
83+
; CHECK-NEXT: ds_read_b32 v3, v1
84+
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 12, v2
85+
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
86+
; CHECK-NEXT: v_add_i32_e32 v13, vcc, 4, v2
8987
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
90-
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
91-
; CHECK-NEXT: s_waitcnt expcnt(0)
92-
; CHECK-NEXT: ds_read_b32 v0, v2
88+
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
89+
; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
90+
; CHECK-NEXT: ds_read_b32 v0, v11
91+
; CHECK-NEXT: s_waitcnt expcnt(1)
92+
; CHECK-NEXT: ds_read_b32 v5, v1
93+
; CHECK-NEXT: ds_read_b32 v4, v12
94+
; CHECK-NEXT: ds_read_b32 v3, v13
95+
; CHECK-NEXT: ds_read_b32 v2, v2
9396
; CHECK-NEXT: ds_read_b32 v1, v10
94-
; CHECK-NEXT: ds_read_b32 v2, v11
95-
; CHECK-NEXT: ds_read_b32 v3, v12
96-
; CHECK-NEXT: ds_read_b32 v4, v7
97-
; CHECK-NEXT: ds_read_b32 v5, v8
9897
; CHECK-NEXT: s_waitcnt lgkmcnt(5)
9998
; CHECK-NEXT: exp mrt0 off, off, off, off
100-
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
101-
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
99+
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
100+
; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
102101
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
103-
; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
102+
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
104103
; CHECK-NEXT: s_endpgm
105104
%load1 = load <6 x float>, <6 x float> addrspace(3)* %arg5, align 4
106105
%vec11 = shufflevector <6 x float> %load1, <6 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

llvm/test/CodeGen/AMDGPU/store-local.128.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -462,12 +462,12 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out,
462462
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
463463
; GFX6-NEXT: s_mov_b32 m0, -1
464464
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
465-
; GFX6-NEXT: v_mov_b32_e32 v0, s6
466-
; GFX6-NEXT: v_mov_b32_e32 v1, s7
465+
; GFX6-NEXT: v_mov_b32_e32 v0, s4
466+
; GFX6-NEXT: v_mov_b32_e32 v1, s5
467467
; GFX6-NEXT: v_mov_b32_e32 v4, s0
468-
; GFX6-NEXT: v_mov_b32_e32 v2, s4
469-
; GFX6-NEXT: v_mov_b32_e32 v3, s5
470-
; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
468+
; GFX6-NEXT: v_mov_b32_e32 v2, s6
469+
; GFX6-NEXT: v_mov_b32_e32 v3, s7
470+
; GFX6-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
471471
; GFX6-NEXT: s_endpgm
472472
;
473473
; GFX10-LABEL: store_lds_v4i32_align8:

0 commit comments

Comments
 (0)