Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,9 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "g
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts")
// Cluster loads: (global ptr, constant cache-policy imm ('Ii'), i32 workgroup
// broadcast mask). Require gfx1250 and wave32 (see "wavefrontsize32" feature).
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b32, "ii*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b64, "V2iV2i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b128, "V4iV4i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")
Expand Down
21 changes: 21 additions & 0 deletions clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,27 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr, Val});
}
case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
  // Map the builtin onto the matching cluster-load intrinsic, which is
  // overloaded on its result type only.
  Intrinsic::ID IID;
  switch (BuiltinID) {
  case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
    IID = Intrinsic::amdgcn_cluster_load_b32;
    break;
  case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
    IID = Intrinsic::amdgcn_cluster_load_b64;
    break;
  case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
    IID = Intrinsic::amdgcn_cluster_load_b128;
    break;
  default:
    llvm_unreachable("unexpected cluster load builtin");
  }
  // Forward all builtin arguments (ptr, cache-policy imm, broadcast mask)
  // to the intrinsic unchanged.
  SmallVector<Value *, 3> Args;
  for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i)
    Args.push_back(EmitScalarExpr(E->getArg(i)));
  llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())});
  return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
// Should this have asan instrumentation?
return emitBuiltinWithOneOverloadedType<5>(*this, E,
Expand Down
36 changes: 36 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cluster-load.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250

typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));

// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b32(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.cluster.load.b32.i32(ptr addrspace(1) [[INPTR:%.*]], i32 10, i32 [[MASK:%.*]])
// CHECK-GFX1250-NEXT: ret i32 [[TMP0]]
//
// Checks b32 codegen: constant cpol 10 plus a runtime broadcast mask.
int test_amdgcn_cluster_load_b32(global int* inptr, int mask)
{
return __builtin_amdgcn_cluster_load_b32(inptr, 10, mask);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b64(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]], i32 22, i32 [[MASK:%.*]])
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
//
// Checks b64 codegen: constant cpol 22 plus a runtime broadcast mask.
v2i test_amdgcn_cluster_load_b64(global v2i* inptr, int mask)
{
return __builtin_amdgcn_cluster_load_b64(inptr, 22, mask);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b128(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32(ptr addrspace(1) [[INPTR:%.*]], i32 27, i32 [[MASK:%.*]])
// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]]
//
// Checks b128 codegen: constant cpol 27 plus a runtime broadcast mask.
v4i test_amdgcn_cluster_load_b128(global v4i* inptr, int mask)
{
return __builtin_amdgcn_cluster_load_b128(inptr, 27, mask);
}
7 changes: 7 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,13 @@ void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global
*b128out = __builtin_amdgcn_flat_load_monitor_b128(b128faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b128' must be a constant integer}}
}

// The cache-policy argument of each cluster-load builtin is declared 'Ii'
// (constant integer); passing the runtime value 'cpol' must be diagnosed.
// NOTE: the expected-error comments below are -verify directives; keep exact.
void test_amdgcn_cluster_load(global int* addr32, global v2i* addr64, global v4i* addr128, global int* b32out, global v2i* b64out, global v4i* b128out, int cpol, int mask)
{
*b32out = __builtin_amdgcn_cluster_load_b32(addr32, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b32' must be a constant integer}}
*b64out = __builtin_amdgcn_cluster_load_b64(addr64, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b64' must be a constant integer}}
*b128out = __builtin_amdgcn_cluster_load_b128(addr128, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b128' must be a constant integer}}
}

void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
local int *laddr32, local v2i* laddr64, local v4i* laddr128, int offset, int mask)
{
Expand Down
17 changes: 17 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -4113,6 +4113,23 @@ def int_amdgcn_tensor_load_to_lds_d2 :
def int_amdgcn_tensor_store_from_lds_d2 :
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;

// Common shape of the cluster-load intrinsics (b32/b64/b128): a read-only
// load through a global pointer with an immediate cache policy and a
// workgroup broadcast mask that is materialized in M0 during selection.
// The result type is the overload ('llvm_any_ty').
class AMDGPUClusterLoad<LLVMType ptr_ty>:
  Intrinsic<
    [llvm_any_ty],
    [ptr_ty,
     llvm_i32_ty,  // gfx12+ cachepolicy:
                   //   bits [0-2] = th
                   //   bits [3-4] = scope
     llvm_i32_ty], // workgroup broadcast mask (in M0)
    [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, IntrWillReturn, IntrConvergent,
     NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
    "", [SDNPMemOperand]
  >;

// All three widths load from a global pointer; they differ only in the
// overloaded result type (i32 / v2i32 / v4i32 at the uses).
def int_amdgcn_cluster_load_b32 : AMDGPUClusterLoad<global_ptr_ty>;
def int_amdgcn_cluster_load_b64 : AMDGPUClusterLoad<global_ptr_ty>;
def int_amdgcn_cluster_load_b128 : AMDGPUClusterLoad<global_ptr_ty>;

class AMDGPULoadMonitor<LLVMType ptr_ty>:
Intrinsic<
[llvm_any_ty],
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ def gi_global_saddr :
def gi_global_saddr_cpol :
GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
GIComplexPatternEquiv<GlobalSAddrCPol>;
// GlobalISel counterpart of the SelectionDAG GlobalSAddrCPolM0 complex
// pattern; routes matching to selectGlobalSAddrCPolM0().
def gi_global_saddr_cpol_m0 :
    GIComplexOperandMatcher<s64, "selectGlobalSAddrCPolM0">,
    GIComplexPatternEquiv<GlobalSAddrCPolM0>;
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2089,6 +2089,23 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
return true;
}

// Match a global saddr addressing mode for a node that carries its cache
// policy as the second-to-last intrinsic operand (the last one being the M0
// broadcast mask). Any SCAL bit in the passed policy is discarded and
// re-added only when the selected addressing mode scales the offset.
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
                                                 SDValue &SAddr,
                                                 SDValue &VOffset,
                                                 SDValue &Offset,
                                                 SDValue &CPol) const {
  bool ScalesOffset = false;
  if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScalesOffset))
    return false;

  // We are assuming CPol is second from last operand of the intrinsic.
  uint64_t Policy =
      N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
  if (ScalesOffset)
    Policy |= AMDGPU::CPol::SCAL;
  CPol = CurDAG->getTargetConstant(Policy, SDLoc(), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
bool SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5708,6 +5708,16 @@ AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
return selectGlobalSAddr(Root, PassedCPol);
}

// GlobalISel twin of SelectGlobalSAddrCPolM0: read the cache-policy
// immediate from the intrinsic (second-to-last operand, the last being the
// M0 broadcast mask), strip any SCAL bit, and hand the result to the common
// saddr selector.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
  const MachineInstr &MI = *Root.getParent();

  // We are assuming CPol is second from last operand of the intrinsic.
  const int64_t PolicyImm =
      MI.getOperand(MI.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
  return selectGlobalSAddr(Root, PolicyImm);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectGlobalSAddrCPol(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrCPolM0(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3338,6 +3338,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 8); // M0
return;
case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b128: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 4); // M0
return;
}
case Intrinsic::amdgcn_s_sleep_var:
assert(OpdMapper.getVRegs(1).empty());
constrainOpWithReadfirstlane(B, MI, 1);
Expand Down Expand Up @@ -5466,6 +5473,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b128: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
unsigned M0Bank =
getRegBankID(MI.getOperand(4).getReg(), MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32);
break;
}
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
Expand Down
37 changes: 37 additions & 0 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ let WantsRoot = true in {
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
def GlobalSAddrCPolM0 : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPolM0", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
}
Expand Down Expand Up @@ -1248,6 +1249,14 @@ defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b6
defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
} // End SubtargetPredicate = isGFX125xOnly

// Cluster-load pseudo instructions (gfx1250+, wave32 only). They implicitly
// read M0, which holds the workgroup broadcast mask.
let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in {
let Uses = [M0, EXEC] in { // Use M0 for broadcast workgroup mask.
defm CLUSTER_LOAD_B32 : FLAT_Global_Load_Pseudo <"cluster_load_b32", VGPR_32>;
defm CLUSTER_LOAD_B64 : FLAT_Global_Load_Pseudo <"cluster_load_b64", VReg_64>;
defm CLUSTER_LOAD_B128 : FLAT_Global_Load_Pseudo <"cluster_load_b128", VReg_128>;
} // End Uses = [M0, EXEC]
} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32

let SubtargetPredicate = isGFX12Plus in {
let Uses = [EXEC, M0] in {
defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
Expand Down Expand Up @@ -1394,6 +1403,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $saddr, $voffset, $offset, $cpol)
>;

// VADDR-form pattern for a load intrinsic with a trailing M0 operand: the
// M0 mask is matched in the source pattern but not re-emitted, since the
// instruction reads M0 implicitly (see the pseudo's Uses = [M0, EXEC]).
class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
  (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol), M0)),
  (inst $vaddr, $offset, $cpol)
>;

// SADDR-form pattern: the cache policy is produced by GlobalSAddrCPolM0
// (which reads the intrinsic's cpol immediate itself), so the explicit
// (i32 timm) operand is matched but unused; M0 is implicit as above.
class GlobalLoadSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
  (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)),
  (inst $saddr, $voffset, $offset, $cpol)
>;

class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
(inst $vaddr, $offset, $cpol)
Expand Down Expand Up @@ -1619,6 +1638,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
}
}

// Instantiates both addressing-mode patterns for an M0-using load: the plain
// vaddr form and the saddr form; the saddr form gets slightly higher
// AddedComplexity so it is preferred when a scalar base address matches.
multiclass GlobalFLATLoadPats_M0<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
  def : FlatLoadSignedPat_M0 <inst, node, vt> {
    let AddedComplexity = 10;
  }

  def : GlobalLoadSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
    let AddedComplexity = 11;
  }
}

multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat_CPOL<inst, node, vt> {
let AddedComplexity = 10;
Expand Down Expand Up @@ -2176,6 +2205,10 @@ let OtherPredicates = [isGFX125xOnly] in {
} // End SubtargetPredicate = isGFX125xOnly

let OtherPredicates = [isGFX1250Plus] in {
defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B32, int_amdgcn_cluster_load_b32, i32>;
defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B64, int_amdgcn_cluster_load_b64, v2i32>;
defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B128, int_amdgcn_cluster_load_b128, v4i32>;

defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
Expand Down Expand Up @@ -3470,6 +3503,10 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;

defm CLUSTER_LOAD_B32 : VFLAT_Real_AllAddr_gfx1250<0x067>;
defm CLUSTER_LOAD_B64 : VFLAT_Real_AllAddr_gfx1250<0x068>;
defm CLUSTER_LOAD_B128 : VFLAT_Real_AllAddr_gfx1250<0x069>;

defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1506,6 +1506,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_load_monitor_b32:
case Intrinsic::amdgcn_global_load_monitor_b64:
case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b128:
case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr8_b64:
Expand Down Expand Up @@ -1636,6 +1639,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
Value *Ptr = nullptr;
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_cluster_load_b128:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_load_tr8_b64:
Expand Down
Loading
Loading