1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -696,6 +696,7 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
//===----------------------------------------------------------------------===//
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//
TARGET_BUILTIN(__builtin_amdgcn_s_cluster_barrier, "v", "n", "gfx1250-insts")

TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")
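For orientation, a minimal usage sketch (hypothetical HIP-style device function; the OpenCL codegen test below exercises the same builtin). Per the .def entry it takes no operands, returns void ("v"), is nothrow ("n"), and requires the gfx1250-insts feature:

__device__ void cluster_sync() {
  // Rendezvous point for all waves of all workgroups in the cluster.
  __builtin_amdgcn_s_cluster_barrier();
}
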
10 changes: 10 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -1478,6 +1478,16 @@ void test_prefetch(generic void *fptr, global void *gptr) {
__builtin_amdgcn_global_prefetch(gptr, 8);
}

// CHECK-LABEL: @test_s_cluster_barrier(
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @llvm.amdgcn.s.cluster.barrier()
// CHECK-NEXT: ret void
//
void test_s_cluster_barrier()
{
__builtin_amdgcn_s_cluster_barrier();
}

// CHECK-LABEL: @test_global_add_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3691,6 +3691,10 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
// gfx1250 intrinsics
// ===----------------------------------------------------------------------===//

// Vanilla cluster sync-barrier
def int_amdgcn_s_cluster_barrier : ClangBuiltin<"__builtin_amdgcn_s_cluster_barrier">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Async waits decrement ASYNCcnt and tensor waits decrement TENSORcnt which is
// modeled as InaccessibleMem.
class AMDGPUWaitAsyncIntrinsic :
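A minimal emission sketch, assuming a pass that already holds an llvm::Instruction *I. The attribute list above (IntrNoMem alongside IntrHasSideEffects and IntrConvergent) tells optimizers the call touches no visible memory, yet must not be deleted, duplicated, or moved across divergent control flow:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

// Hypothetical helper: emit a cluster barrier immediately before I.
static void emitClusterBarrier(Instruction *I) {
  IRBuilder<> B(I);
  B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_cluster_barrier, {});
}
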
68 changes: 60 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-lower-intrinsics"

@@ -49,7 +50,6 @@ class AMDGPULowerIntrinsicsLegacy : public ModulePass {

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
AU.setPreservesCFG();
}
};

@@ -73,6 +73,7 @@ bool AMDGPULowerIntrinsicsImpl::run() {
case Intrinsic::amdgcn_s_barrier_signal:
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
case Intrinsic::amdgcn_s_barrier_wait:
case Intrinsic::amdgcn_s_cluster_barrier:
forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); });
break;
}
@@ -81,13 +82,14 @@
return Changed;
}

// Optimize barriers and lower s_barrier to a sequence of split barrier
// intrinsics.
// Optimize barriers and lower s_(cluster_)barrier to a sequence of split
// barrier intrinsics.
bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait);
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier);

const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
bool IsSingleWaveWG = false;
@@ -99,7 +101,59 @@ bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {

IRBuilder<> B(&I);

if (IsSingleWaveWG) {
// Lower the s_cluster_barrier intrinsic first. There is no corresponding
// hardware instruction in any subtarget.
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier) {
// The default cluster barrier expects one signal per workgroup. So we need
// a workgroup barrier first.
if (IsSingleWaveWG) {
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
} else {
Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
Value *IsFirst = B.CreateIntrinsic(
B.getInt1Ty(), Intrinsic::amdgcn_s_barrier_signal_isfirst,
{BarrierID_32});
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
{BarrierID_16});

Instruction *ThenTerm =
SplitBlockAndInsertIfThen(IsFirst, I.getIterator(), false);
B.SetInsertPoint(ThenTerm);
}

// Now we can signal the cluster barrier from a single wave and wait for the
// barrier in all waves.
Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::CLUSTER);
Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::CLUSTER);
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
{BarrierID_32});

B.SetInsertPoint(&I);
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
{BarrierID_16});

I.eraseFromParent();
return true;
}

bool IsWorkgroupScope = false;

if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst) {
int BarrierID = cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
if (BarrierID == AMDGPU::Barrier::TRAP ||
BarrierID == AMDGPU::Barrier::WORKGROUP ||
(BarrierID >= AMDGPU::Barrier::NAMED_BARRIER_FIRST &&
BarrierID <= AMDGPU::Barrier::NAMED_BARRIER_LAST))
IsWorkgroupScope = true;
} else {
assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
IsWorkgroupScope = true;
}

if (IsWorkgroupScope && IsSingleWaveWG) {
// Down-grade waits, remove split signals.
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
@@ -134,9 +188,7 @@ PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
AMDGPULowerIntrinsicsImpl Impl(M, TM);
if (!Impl.run())
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
return PA;
return PreservedAnalyses::none();
}

bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
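The pass stops claiming CFG preservation (setPreservesCFG() removed, PreservedAnalyses::none() instead of preserving CFGAnalyses) because the guarded cluster signal is built with SplitBlockAndInsertIfThen, which splits the block containing the barrier and inserts a conditional branch. A condensed sketch of that pattern, with names assumed:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

// Only the wave that signalled first executes the then-block; the rest of the
// original block becomes a new fall-through successor, changing the CFG.
static void guardWithIsFirst(IRBuilder<> &B, Value *IsFirst, Instruction *I) {
  Instruction *ThenTerm =
      SplitBlockAndInsertIfThen(IsFirst, I->getIterator(), /*Unreachable=*/false);
  B.SetInsertPoint(ThenTerm); // the single-wave cluster signal goes here
}
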
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -360,6 +360,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_s_barrier:
case Intrinsic::amdgcn_s_cluster_barrier:
case Intrinsic::amdgcn_s_barrier_signal:
case Intrinsic::amdgcn_s_barrier_signal_var:
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1108,7 +1108,14 @@ enum Register_Flag : uint8_t {
namespace AMDGPU {
namespace Barrier {

enum Type { TRAP = -2, WORKGROUP = -1 };
enum Type {
CLUSTER_TRAP = -4,
CLUSTER = -3,
TRAP = -2,
WORKGROUP = -1,
NAMED_BARRIER_FIRST = 1,
NAMED_BARRIER_LAST = 16,
};

enum {
BARRIER_SCOPE_WORKGROUP = 0,
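Restating the new scope check as a hypothetical standalone helper: TRAP (-2), WORKGROUP (-1), and the named barriers 1..16 are workgroup scope, so they stay eligible for the single-wave-workgroup downgrade, while CLUSTER (-3) and CLUSTER_TRAP (-4) are not. These enum values are also the magic immediates in the lowered IR, e.g. the i32 -3 operands in the test below:

// Hypothetical helper mirroring the check in AMDGPULowerIntrinsics.cpp.
static bool isWorkgroupScopeBarrier(int BarrierID) {
  return BarrierID == AMDGPU::Barrier::TRAP ||      // -2
         BarrierID == AMDGPU::Barrier::WORKGROUP || // -1
         (BarrierID >= AMDGPU::Barrier::NAMED_BARRIER_FIRST && // 1
          BarrierID <= AMDGPU::Barrier::NAMED_BARRIER_LAST);   // 16
}
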
107 changes: 107 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lower-intrinsics-cluster-barrier.ll
@@ -0,0 +1,107 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=0 | FileCheck --check-prefixes=CHECK,NOOPT %s
; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=CHECK,OPT-WAVE32 %s
; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=CHECK,OPT-WAVE64 %s

declare void @foo(i1)

; Verify that the explicit use of a split cluster barrier isn't optimized away.
define amdgpu_kernel void @split_barriers() "amdgpu-flat-work-group-size"="32,32" {
; CHECK-LABEL: define amdgpu_kernel void @split_barriers(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; CHECK-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3)
; CHECK-NEXT: call void @foo(i1 [[ISFIRST]])
; CHECK-NEXT: ret void
;
call void @llvm.amdgcn.s.barrier.signal(i32 -3)
call void @llvm.amdgcn.s.barrier.wait(i16 -3)
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3)
call void @foo(i1 %isfirst)
ret void
}

define amdgpu_kernel void @s_cluster_barrier() {
; CHECK-LABEL: define amdgpu_kernel void @s_cluster_barrier(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; CHECK: [[BB2]]:
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; CHECK-NEXT: br label %[[BB3]]
; CHECK: [[BB3]]:
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; CHECK-NEXT: ret void
;
call void @llvm.amdgcn.s.cluster.barrier()
ret void
}

define amdgpu_kernel void @s_cluster_barrier_wg32() "amdgpu-flat-work-group-size"="32,32" {
; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
; NOOPT-SAME: ) #[[ATTR1]] {
; NOOPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
; NOOPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; NOOPT: [[BB2]]:
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; NOOPT-NEXT: br label %[[BB3]]
; NOOPT: [[BB3]]:
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; NOOPT-NEXT: ret void
;
; OPT-WAVE32-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
; OPT-WAVE32-SAME: ) #[[ATTR1]] {
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.wave.barrier()
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; OPT-WAVE32-NEXT: ret void
;
; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
; OPT-WAVE64-SAME: ) #[[ATTR1]] {
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; OPT-WAVE64-NEXT: ret void
;
call void @llvm.amdgcn.s.cluster.barrier()
ret void
}

define amdgpu_kernel void @s_cluster_barrier_wg64() "amdgpu-flat-work-group-size"="64,64" {
; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
; NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
; NOOPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
; NOOPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; NOOPT: [[BB2]]:
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; NOOPT-NEXT: br label %[[BB3]]
; NOOPT: [[BB3]]:
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; NOOPT-NEXT: ret void
;
; OPT-WAVE32-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
; OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
; OPT-WAVE32-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
; OPT-WAVE32-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; OPT-WAVE32: [[BB2]]:
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; OPT-WAVE32-NEXT: br label %[[BB3]]
; OPT-WAVE32: [[BB3]]:
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; OPT-WAVE32-NEXT: ret void
;
; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] {
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; OPT-WAVE64-NEXT: ret void
;
call void @llvm.amdgcn.s.cluster.barrier()
ret void
}