Skip to content

Commit 3870b36

Browse files
committed
[AMDGPU] Split unaligned 3 DWORD DS operations
I have written a minitest to check the performance. Overall the benefit of aligned b96 operations on data which is not known but happens to be aligned is small, while performance hit of using b96 operations on a really unaligned memory is high. The only exception is when data is not aligned even by 4, it is better to use b96 in this case. Here is the test output on Vega and Navi: ``` Using platform: AMD Accelerated Parallel Processing Using device: gfx900:xnack- ds_write_b96 aligned: 3.4 sec ds_write_b32 + ds_write_b64 aligned: 4.5 sec ds_write_b32 * 3 aligned: 4.8 sec ds_write_b96 misaligned by 1: 4.8 sec ds_write_b32 + ds_write_b64 misaligned by 1: 7.2 sec ds_write_b32 * 3 misaligned by 1: 10.0 sec ds_write_b96 misaligned by 2: 4.8 sec ds_write_b32 + ds_write_b64 misaligned by 2: 7.2 sec ds_write_b32 * 3 misaligned by 2: 10.1 sec ds_write_b96 misaligned by 4: 4.8 sec ds_write_b32 + ds_write_b64 misaligned by 4: 4.2 sec ds_write_b32 * 3 misaligned by 4: 4.9 sec ds_write_b96 misaligned by 8: 4.8 sec ds_write_b32 + ds_write_b64 misaligned by 8: 4.6 sec ds_write_b32 * 3 misaligned by 8: 4.9 sec ds_read_b96 aligned: 3.3 sec ds_read_b32 + ds_read_b64 aligned: 4.9 sec ds_read_b32 * 3 aligned: 2.6 sec ds_read_b96 misaligned by 1: 4.1 sec ds_read_b32 + ds_read_b64 misaligned by 1: 7.2 sec ds_read_b32 * 3 misaligned by 1: 10.1 sec ds_read_b96 misaligned by 2: 4.1 sec ds_read_b32 + ds_read_b64 misaligned by 2: 7.2 sec ds_read_b32 * 3 misaligned by 2: 10.1 sec ds_read_b96 misaligned by 4: 4.1 sec ds_read_b32 + ds_read_b64 misaligned by 4: 2.6 sec ds_read_b32 * 3 misaligned by 4: 2.6 sec ds_read_b96 misaligned by 8: 4.1 sec ds_read_b32 + ds_read_b64 misaligned by 8: 4.9 sec ds_read_b32 * 3 misaligned by 8: 2.6 sec Using platform: AMD Accelerated Parallel Processing Using device: gfx1030 ds_write_b96 aligned: 4.1 sec ds_write_b32 + ds_write_b64 aligned: 13.0 sec ds_write_b32 * 3 aligned: 4.5 sec ds_write_b96 misaligned by 1: 12.5 sec ds_write_b32 + ds_write_b64 misaligned by 
1: 22.0 sec ds_write_b32 * 3 misaligned by 1: 31.5 sec ds_write_b96 misaligned by 2: 12.4 sec ds_write_b32 + ds_write_b64 misaligned by 2: 22.0 sec ds_write_b32 * 3 misaligned by 2: 31.5 sec ds_write_b96 misaligned by 4: 12.4 sec ds_write_b32 + ds_write_b64 misaligned by 4: 4.0 sec ds_write_b32 * 3 misaligned by 4: 4.5 sec ds_write_b96 misaligned by 8: 12.4 sec ds_write_b32 + ds_write_b64 misaligned by 8: 13.0 sec ds_write_b32 * 3 misaligned by 8: 4.5 sec ds_read_b96 aligned: 3.8 sec ds_read_b32 + ds_read_b64 aligned: 12.8 sec ds_read_b32 * 3 aligned: 4.4 sec ds_read_b96 misaligned by 1: 10.9 sec ds_read_b32 + ds_read_b64 misaligned by 1: 21.8 sec ds_read_b32 * 3 misaligned by 1: 31.5 sec ds_read_b96 misaligned by 2: 10.9 sec ds_read_b32 + ds_read_b64 misaligned by 2: 21.9 sec ds_read_b32 * 3 misaligned by 2: 31.5 sec ds_read_b96 misaligned by 4: 10.9 sec ds_read_b32 + ds_read_b64 misaligned by 4: 3.8 sec ds_read_b32 * 3 misaligned by 4: 4.5 sec ds_read_b96 misaligned by 8: 10.9 sec ds_read_b32 + ds_read_b64 misaligned by 8: 12.8 sec ds_read_b32 * 3 misaligned by 8: 4.5 sec ``` Fixes: SWDEV-330802 Differential Revision: https://reviews.llvm.org/D123524
1 parent b8e09f1 commit 3870b36

File tree

4 files changed

+18
-31
lines changed

4 files changed

+18
-31
lines changed

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -877,8 +877,8 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
877877

878878
let SubtargetPredicate = HasUnalignedAccessMode in {
879879

880-
// FIXME: From performance point of view, is ds_read_b96/ds_write_b96 better choice
881-
// for unaligned accesses?
880+
// Selection will split most of the unaligned 3 dword accesses due to performance
881+
// reasons when beneficial. Keep these two patterns for the rest of the cases.
882882
foreach vt = VReg_96.RegTypes in {
883883
defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
884884
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1553,6 +1553,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
15531553
// 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
15541554
// gfx8 and older.
15551555
RequiredAlignment = Align(16);
1556+
1557+
if (Subtarget->hasUnalignedDSAccessEnabled()) {
1558+
// Naturally aligned access is fastest. However, also report it is Fast
1559+
// if memory is aligned less than DWORD. A narrow load or store will
1560+
// be equally slow as a single ds_read_b96/ds_write_b96, but there will
1561+
// be more of them, so overall we will pay less penalty issuing a single
1562+
// instruction.
1563+
if (IsFast)
1564+
*IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1565+
return true;
1566+
}
1567+
15561568
break;
15571569
case 128:
15581570
if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

llvm/test/CodeGen/AMDGPU/ds-alignment.ll

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -566,23 +566,11 @@ define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> add
566566
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
567567
; ALIGNED-NEXT: ds_write_b32 v3, v2 offset:8
568568
; ALIGNED-NEXT: s_endpgm
569-
;
570-
; UNALIGNED-LABEL: ds12align4:
571-
; UNALIGNED: ; %bb.0:
572-
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
573-
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
574-
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
575-
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
576-
; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
577-
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
578-
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
579-
; UNALIGNED-NEXT: s_endpgm
580569
%val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
581570
store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
582571
ret void
583572
}
584573

585-
; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
586574
define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
587575
; ALIGNED-SDAG-LABEL: ds12align8:
588576
; ALIGNED-SDAG: ; %bb.0:
@@ -611,17 +599,6 @@ define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> add
611599
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
612600
; ALIGNED-GISEL-NEXT: ds_write_b32 v3, v2 offset:8
613601
; ALIGNED-GISEL-NEXT: s_endpgm
614-
;
615-
; UNALIGNED-LABEL: ds12align8:
616-
; UNALIGNED: ; %bb.0:
617-
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
618-
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
619-
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
620-
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
621-
; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
622-
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
623-
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
624-
; UNALIGNED-NEXT: s_endpgm
625602
%val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
626603
store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
627604
ret void

llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,10 @@ bb:
4747
}
4848

4949
; GCN-LABEL: test_local_misaligned_v3:
50-
; ALIGNED-DAG: ds_read2_b32
51-
; ALIGNED-DAG: ds_read_b32
52-
; ALIGNED-DAG: ds_write2_b32
53-
; ALIGNED-DAG: ds_write_b32
54-
; UNALIGNED-DAG: ds_read_b96
55-
; UNALIGNED-DAG: ds_write_b96
50+
; GCN-DAG: ds_read2_b32
51+
; GCN-DAG: ds_read_b32
52+
; GCN-DAG: ds_write2_b32
53+
; GCN-DAG: ds_write_b32
5654
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
5755
bb:
5856
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)