Skip to content

Commit 3870b36

Browse files
committed
[AMDGPU] Split unaligned 3 DWORD DS operations
I have written a minitest to check the performance. Overall the benefit of aligned b96 operations on data which is not known but happens to be aligned is small, while performance hit of using b96 operations on a really unaligned memory is high. The only exception is when data is not aligned even by 4, it is better to use b96 in this case. Here is the test output on Vega and Navi: ``` Using platform: AMD Accelerated Parallel Processing Using device: gfx900:xnack- ds_write_b96 aligned: 3.4 sec ds_write_b32 + ds_write_b64 aligned: 4.5 sec ds_write_b32 * 3 aligned: 4.8 sec ds_write_b96 misaligned by 1: 4.8 sec ds_write_b32 + ds_write_b64 misaligned by 1: 7.2 sec ds_write_b32 * 3 misaligned by 1: 10.0 sec ds_write_b96 misaligned by 2: 4.8 sec ds_write_b32 + ds_write_b64 misaligned by 2: 7.2 sec ds_write_b32 * 3 misaligned by 2: 10.1 sec ds_write_b96 misaligned by 4: 4.8 sec ds_write_b32 + ds_write_b64 misaligned by 4: 4.2 sec ds_write_b32 * 3 misaligned by 4: 4.9 sec ds_write_b96 misaligned by 8: 4.8 sec ds_write_b32 + ds_write_b64 misaligned by 8: 4.6 sec ds_write_b32 * 3 misaligned by 8: 4.9 sec ds_read_b96 aligned: 3.3 sec ds_read_b32 + ds_read_b64 aligned: 4.9 sec ds_read_b32 * 3 aligned: 2.6 sec ds_read_b96 misaligned by 1: 4.1 sec ds_read_b32 + ds_read_b64 misaligned by 1: 7.2 sec ds_read_b32 * 3 misaligned by 1: 10.1 sec ds_read_b96 misaligned by 2: 4.1 sec ds_read_b32 + ds_read_b64 misaligned by 2: 7.2 sec ds_read_b32 * 3 misaligned by 2: 10.1 sec ds_read_b96 misaligned by 4: 4.1 sec ds_read_b32 + ds_read_b64 misaligned by 4: 2.6 sec ds_read_b32 * 3 misaligned by 4: 2.6 sec ds_read_b96 misaligned by 8: 4.1 sec ds_read_b32 + ds_read_b64 misaligned by 8: 4.9 sec ds_read_b32 * 3 misaligned by 8: 2.6 sec Using platform: AMD Accelerated Parallel Processing Using device: gfx1030 ds_write_b96 aligned: 4.1 sec ds_write_b32 + ds_write_b64 aligned: 13.0 sec ds_write_b32 * 3 aligned: 4.5 sec ds_write_b96 misaligned by 1: 12.5 sec ds_write_b32 + ds_write_b64 misaligned by 
1: 22.0 sec ds_write_b32 * 3 misaligned by 1: 31.5 sec ds_write_b96 misaligned by 2: 12.4 sec ds_write_b32 + ds_write_b64 misaligned by 2: 22.0 sec ds_write_b32 * 3 misaligned by 2: 31.5 sec ds_write_b96 misaligned by 4: 12.4 sec ds_write_b32 + ds_write_b64 misaligned by 4: 4.0 sec ds_write_b32 * 3 misaligned by 4: 4.5 sec ds_write_b96 misaligned by 8: 12.4 sec ds_write_b32 + ds_write_b64 misaligned by 8: 13.0 sec ds_write_b32 * 3 misaligned by 8: 4.5 sec ds_read_b96 aligned: 3.8 sec ds_read_b32 + ds_read_b64 aligned: 12.8 sec ds_read_b32 * 3 aligned: 4.4 sec ds_read_b96 misaligned by 1: 10.9 sec ds_read_b32 + ds_read_b64 misaligned by 1: 21.8 sec ds_read_b32 * 3 misaligned by 1: 31.5 sec ds_read_b96 misaligned by 2: 10.9 sec ds_read_b32 + ds_read_b64 misaligned by 2: 21.9 sec ds_read_b32 * 3 misaligned by 2: 31.5 sec ds_read_b96 misaligned by 4: 10.9 sec ds_read_b32 + ds_read_b64 misaligned by 4: 3.8 sec ds_read_b32 * 3 misaligned by 4: 4.5 sec ds_read_b96 misaligned by 8: 10.9 sec ds_read_b32 + ds_read_b64 misaligned by 8: 12.8 sec ds_read_b32 * 3 misaligned by 8: 4.5 sec ``` Fixes: SWDEV-330802 Differential Revision: https://reviews.llvm.org/D123524
1 parent b8e09f1 commit 3870b36

File tree

4 files changed

+18
-31
lines changed

4 files changed

+18
-31
lines changed

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -877,8 +877,8 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
877877

878878
let SubtargetPredicate = HasUnalignedAccessMode in {
879879

880-
// FIXME: From performance point of view, is ds_read_b96/ds_write_b96 better choice
881-
// for unaligned accesses?
880+
// Selection will split most of the unaligned 3 dword accesses due to performance
881+
// reasons when beneficial. Keep these two patterns for the rest of the cases.
882882
foreach vt = VReg_96.RegTypes in {
883883
defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
884884
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1553,6 +1553,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
15531553
// 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
15541554
// gfx8 and older.
15551555
RequiredAlignment = Align(16);
1556+
1557+
if (Subtarget->hasUnalignedDSAccessEnabled()) {
1558+
// Naturally aligned access is fastest. However, also report it is Fast
1559+
// if memory is aligned less than DWORD. A narrow load or store will
1560+
// be equally slow as a single ds_read_b96/ds_write_b96, but there will
1561+
// be more of them, so overall we will pay less penalty issuing a single
1562+
// instruction.
1563+
if (IsFast)
1564+
*IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1565+
return true;
1566+
}
1567+
15561568
break;
15571569
case 128:
15581570
if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

llvm/test/CodeGen/AMDGPU/ds-alignment.ll

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -566,23 +566,11 @@ define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> add
566566
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
567567
; ALIGNED-NEXT: ds_write_b32 v3, v2 offset:8
568568
; ALIGNED-NEXT: s_endpgm
569-
;
570-
; UNALIGNED-LABEL: ds12align4:
571-
; UNALIGNED: ; %bb.0:
572-
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
573-
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
574-
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
575-
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
576-
; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
577-
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
578-
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
579-
; UNALIGNED-NEXT: s_endpgm
580569
%val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
581570
store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
582571
ret void
583572
}
584573

585-
; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
586574
define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
587575
; ALIGNED-SDAG-LABEL: ds12align8:
588576
; ALIGNED-SDAG: ; %bb.0:
@@ -611,17 +599,6 @@ define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> add
611599
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
612600
; ALIGNED-GISEL-NEXT: ds_write_b32 v3, v2 offset:8
613601
; ALIGNED-GISEL-NEXT: s_endpgm
614-
;
615-
; UNALIGNED-LABEL: ds12align8:
616-
; UNALIGNED: ; %bb.0:
617-
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
618-
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
619-
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
620-
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
621-
; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
622-
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
623-
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
624-
; UNALIGNED-NEXT: s_endpgm
625602
%val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
626603
store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
627604
ret void

llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,10 @@ bb:
4747
}
4848

4949
; GCN-LABEL: test_local_misaligned_v3:
50-
; ALIGNED-DAG: ds_read2_b32
51-
; ALIGNED-DAG: ds_read_b32
52-
; ALIGNED-DAG: ds_write2_b32
53-
; ALIGNED-DAG: ds_write_b32
54-
; UNALIGNED-DAG: ds_read_b96
55-
; UNALIGNED-DAG: ds_write_b96
50+
; GCN-DAG: ds_read2_b32
51+
; GCN-DAG: ds_read_b32
52+
; GCN-DAG: ds_write2_b32
53+
; GCN-DAG: ds_write_b32
5654
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
5755
bb:
5856
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)