Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,12 @@ def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access",
"Hardware supports unaligned local and region loads and stores"
>;

// Subtarget feature selected with "+require-naturally-aligned-buffer-access".
// When enabled it sets the RequireNaturallyAlignedBufferAccess subtarget field
// to "true".
def FeatureRequireNaturallyAlignedBufferAccess : SubtargetFeature<"require-naturally-aligned-buffer-access",
"RequireNaturallyAlignedBufferAccess",
"true",
"Requires natural alignment of buffer accesses"
>;

def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
"HasApertureRegs",
"true",
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool BackOffBarrier = false;
bool UnalignedScratchAccess = false;
bool UnalignedAccessMode = false;
bool RequireNaturallyAlignedBufferAccess = false;
bool HasApertureRegs = false;
bool SupportsXNACK = false;
bool KernargPreload = false;
Expand Down Expand Up @@ -600,6 +601,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return UnalignedAccessMode;
}

/// \returns the RequireNaturallyAlignedBufferAccess flag, i.e. whether this
/// subtarget requires natural alignment of buffer accesses.
bool hasRequireNaturallyAlignedBufferAccess() const {
return RequireNaturallyAlignedBufferAccess;
}

/// \returns the value of the HasApertureRegs subtarget flag.
bool hasApertureRegs() const {
return HasApertureRegs;
}
Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1840,6 +1840,20 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
Subtarget->hasUnalignedBufferAccessEnabled();
}

// Check natural alignment of buffer accesses if the target requires it. This
// is needed only if robust out-of-bounds guarantees are required. Normally
// hardware will ensure proper out-of-bounds behavior, but in the edge case
// where an access starts out-of-bounds and then enters in-bounds, the entire
// access would be treated as out-of-bounds. Requiring natural alignment
// avoids the problem.
if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
if (Subtarget->hasRequireNaturallyAlignedBufferAccess() &&
Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
return false;
}

// Smaller than dword value must be aligned.
if (Size < 32)
return false;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -mattr=+require-naturally-aligned-buffer-access -S -o - %s | FileCheck --check-prefix=ALIGNED %s
; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=UNALIGNED %s

; Checks that the require-naturally-aligned-buffer-access target feature
; prevents merging loads when the merged load would not be naturally aligned.
; All four i32 loads below are only "align 4": with the feature enabled
; (ALIGNED prefix) they are expected to stay separate, while without it
; (UNALIGNED prefix) they are expected to be combined into a single <4 x i32>
; load with align 4.

define amdgpu_kernel void @merge_align_4(ptr addrspace(7) nocapture %p, ptr addrspace(7) nocapture %p2) #0 {
;
; ALIGNED-LABEL: define amdgpu_kernel void @merge_align_4(
; ALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) #[[ATTR0:[0-9]+]] {
; ALIGNED-NEXT: [[ENTRY:.*:]]
; ALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; ALIGNED-NEXT: [[LD_M8:%.*]] = load i32, ptr addrspace(7) [[GEP_M8]], align 4
; ALIGNED-NEXT: [[GEP_M4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -4
; ALIGNED-NEXT: [[LD_M4:%.*]] = load i32, ptr addrspace(7) [[GEP_M4]], align 4
; ALIGNED-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 0
; ALIGNED-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(7) [[GEP_0]], align 4
; ALIGNED-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i64 4
; ALIGNED-NEXT: [[LD_4:%.*]] = load i32, ptr addrspace(7) [[GEP_4]], align 4
; ALIGNED-NEXT: ret void
;
; UNALIGNED-LABEL: define amdgpu_kernel void @merge_align_4(
; UNALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) {
; UNALIGNED-NEXT: [[ENTRY:.*:]]
; UNALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; UNALIGNED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 4
; UNALIGNED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; UNALIGNED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; UNALIGNED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; UNALIGNED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; UNALIGNED-NEXT: ret void
;
entry:
%gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
%ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 4
%gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
%ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
%gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
%ld_0 = load i32, ptr addrspace(7) %gep_0, align 4
%gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
%ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
ret void
}

; Checks that the require-naturally-aligned-buffer-access target feature does
; not prevent merging loads when the merged load would be naturally aligned.
; The load at the lowest offset (-8) is "align 16", so both the ALIGNED and
; UNALIGNED prefixes expect a single <4 x i32> load with align 16.

define amdgpu_kernel void @merge_align_16(ptr addrspace(7) nocapture %p, ptr addrspace(7) nocapture %p2) #0 {
; ALIGNED-LABEL: define amdgpu_kernel void @merge_align_16(
; ALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) #[[ATTR0]] {
; ALIGNED-NEXT: [[ENTRY:.*:]]
; ALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; ALIGNED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
; ALIGNED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; ALIGNED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; ALIGNED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; ALIGNED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; ALIGNED-NEXT: ret void
;
; UNALIGNED-LABEL: define amdgpu_kernel void @merge_align_16(
; UNALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) {
; UNALIGNED-NEXT: [[ENTRY:.*:]]
; UNALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; UNALIGNED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
; UNALIGNED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; UNALIGNED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; UNALIGNED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; UNALIGNED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; UNALIGNED-NEXT: ret void
;
entry:
%gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
%ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 16
%gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
%ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
%gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
%ld_0 = load i32, ptr addrspace(7) %gep_0, align 8
%gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
%ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
ret void
}
Loading