Skip to content

Commit a942c6b

Browse files
committed
[AMDGPU] Handle nontemporal and amdgpu.last.use metadata in amdgpu-lower-buffer-fat-pointers
1 parent f9c8c01 commit a942c6b

File tree

4 files changed

+734
-22
lines changed

4 files changed

+734
-22
lines changed

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1074,18 +1074,6 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
10741074
Args.push_back(IRB.getInt32(0));
10751075

10761076
uint32_t Aux = 0;
1077-
bool IsInvariant =
1078-
(isa<LoadInst>(I) && I->getMetadata(LLVMContext::MD_invariant_load));
1079-
bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal);
1080-
// Atomic loads and stores need glc, atomic read-modify-write doesn't.
1081-
bool IsOneWayAtomic =
1082-
!isa<AtomicRMWInst>(I) && Order != AtomicOrdering::NotAtomic;
1083-
if (IsOneWayAtomic)
1084-
Aux |= AMDGPU::CPol::GLC;
1085-
if (IsNonTemporal && !IsInvariant)
1086-
Aux |= AMDGPU::CPol::SLC;
1087-
if (isa<LoadInst>(I) && ST->getGeneration() == AMDGPUSubtarget::GFX10)
1088-
Aux |= (Aux & AMDGPU::CPol::GLC ? AMDGPU::CPol::DLC : 0);
10891077
if (IsVolatile)
10901078
Aux |= AMDGPU::CPol::VOLATILE;
10911079
Args.push_back(IRB.getInt32(Aux));

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,6 +1240,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
12401240
}
12411241

12421242
Info.flags |= MachineMemOperand::MODereferenceable;
1243+
1244+
if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1245+
Info.flags |= MachineMemOperand::MONonTemporal;
1246+
Info.flags |= getTargetMMOFlags(CI);
1247+
12431248
if (ME.onlyReadsMemory()) {
12441249
if (RsrcIntr->IsImage) {
12451250
unsigned MaxNumLanes = 4;

llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,16 @@ define void @loads(ptr addrspace(8) %buf) {
1111
; CHECK-NEXT: [[SCALAR:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0)
1212
; CHECK-NEXT: [[VEC2:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
1313
; CHECK-NEXT: [[VEC4:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
14-
; CHECK-NEXT: [[NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 2), !nontemporal [[META0:![0-9]+]]
14+
; CHECK-NEXT: [[NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0), !nontemporal [[META0:![0-9]+]]
1515
; CHECK-NEXT: [[INVARIANT:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0), !invariant.load [[META1:![0-9]+]]
1616
; CHECK-NEXT: [[NONTEMPORAL_INVARIANT:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0), !invariant.load [[META1]], !nontemporal [[META0]]
1717
; CHECK-NEXT: [[VOLATILE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648)
18-
; CHECK-NEXT: [[VOLATILE_NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483646), !nontemporal [[META0]]
18+
; CHECK-NEXT: [[VOLATILE_NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648), !nontemporal [[META0]]
1919
; CHECK-NEXT: fence syncscope("wavefront") release
20-
; CHECK-NEXT: [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483647)
20+
; CHECK-NEXT: [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648)
2121
; CHECK-NEXT: fence syncscope("wavefront") acquire
22-
; CHECK-NEXT: [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
23-
; CHECK-NEXT: [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
22+
; CHECK-NEXT: [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0)
23+
; CHECK-NEXT: [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0)
2424
; CHECK-NEXT: fence acquire
2525
; CHECK-NEXT: ret void
2626
;
@@ -50,15 +50,15 @@ define void @stores(ptr addrspace(8) %buf, float %f, <4 x float> %f4) {
5050
; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], float [[F:%.*]], <4 x float> [[F4:%.*]]) #[[ATTR0]] {
5151
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0)
5252
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[F4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
53-
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 2), !nontemporal [[META0]]
53+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0), !nontemporal [[META0]]
5454
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648)
55-
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483646), !nontemporal [[META0]]
55+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648), !nontemporal [[META0]]
5656
; CHECK-NEXT: fence syncscope("wavefront") release
57-
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483647)
57+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648)
5858
; CHECK-NEXT: fence syncscope("wavefront") acquire
59-
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
59+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0)
6060
; CHECK-NEXT: fence release
61-
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
61+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0)
6262
; CHECK-NEXT: ret void
6363
;
6464
%base = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)

0 commit comments

Comments
 (0)