Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1812,6 +1812,13 @@ The AMDGPU backend supports the following LLVM IR attributes.
offset by one less than the number of dynamic VGPR blocks required
by the function encoded in bits 5..3.

"amdgpu-cluster-dims"="x,y,z" Specify the cluster workgroup dimensions. A value of "0,0,0" indicates that
clusters are disabled. A value of "1024,1024,1024" indicates that clusters are enabled,
but the dimensions cannot be determined at compile time. Any other value explicitly
specifies the cluster dimensions.

This is only relevant on targets with cluster support.

================================================ ==========================================================

Calling Conventions
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,14 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
return std::tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
return std::tuple(nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::LDS_KERNEL_ID:
return std::tuple(LDSKernelId ? &LDSKernelId : nullptr,
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));
Expand Down
19 changes: 13 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,18 +111,25 @@ struct AMDGPUFunctionArgInfo {
DISPATCH_ID = 4,
FLAT_SCRATCH_INIT = 5,
LDS_KERNEL_ID = 6, // LLVM internal, not part of the ABI
WORKGROUP_ID_X = 10,
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
WORKGROUP_ID_X = 10, // Also used for cluster ID X.
WORKGROUP_ID_Y = 11, // Also used for cluster ID Y.
WORKGROUP_ID_Z = 12, // Also used for cluster ID Z.
PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
IMPLICIT_BUFFER_PTR = 15,
IMPLICIT_ARG_PTR = 16,
PRIVATE_SEGMENT_SIZE = 17,
CLUSTER_WORKGROUP_ID_X = 21,
CLUSTER_WORKGROUP_ID_Y = 22,
CLUSTER_WORKGROUP_ID_Z = 23,
CLUSTER_WORKGROUP_MAX_ID_X = 24,
CLUSTER_WORKGROUP_MAX_ID_Y = 25,
CLUSTER_WORKGROUP_MAX_ID_Z = 26,
CLUSTER_WORKGROUP_MAX_FLAT_ID = 27,

// VGPRS:
WORKITEM_ID_X = 18,
WORKITEM_ID_Y = 19,
WORKITEM_ID_Z = 20,
WORKITEM_ID_X = 28,
WORKITEM_ID_Y = 29,
WORKITEM_ID_Z = 30,
FIRST_VGPR_VALUE = WORKITEM_ID_X
};
// clang-format on
Expand Down
221 changes: 212 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4452,6 +4452,74 @@ void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
}
}

/// Legalizes a workgroup-ID intrinsic for a single grid dimension.
///
/// On subtargets without cluster support, the value preloaded for
/// \p WorkGroupIdPV is written directly into the result register. On
/// subtargets with cluster support that preloaded value is the cluster ID
/// whenever clusters are enabled, so the grid-wide workgroup ID is rebuilt as
///   ClusterId * (ClusterMaxId + 1) + ClusterWorkGroupId
/// and the function's cluster-dims kind decides which of the two values is
/// the final result.
///
/// \param MI intrinsic being legalized; operand 0 is the result register. It
///        is erased on every successful path.
/// \param B builder used to emit the replacement instructions.
/// \param WorkGroupIdPV preloaded workgroup ID (the cluster ID when clusters
///        are enabled).
/// \param ClusterMaxIdPV preloaded maximum workgroup ID within a cluster for
///        this dimension.
/// \param ClusterWorkGroupIdPV preloaded workgroup ID within the cluster for
///        this dimension.
/// \returns false if a required input value could not be loaded, true
///          otherwise.
bool AMDGPULegalizerInfo::legalizeWorkGroupId(
    MachineInstr &MI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
  const Register Result = MI.getOperand(0).getReg();

  // Shared epilogue: every successful path deletes the intrinsic.
  const auto Done = [&MI] {
    MI.eraseFromParent();
    return true;
  };

  // No cluster support: the preloaded value already is the workgroup ID.
  if (!ST.hasClusters())
    return loadInputValue(Result, B, WorkGroupIdPV) && Done();

  // Cluster-capable subtarget: gather the three inputs needed to compute the
  // workgroup's position in the whole grid.
  MachineRegisterInfo &RegInfo = *B.getMRI();
  const LLT I32 = LLT::scalar(32);
  Register ClusterPos = RegInfo.createGenericVirtualRegister(I32);
  Register ClusterMaxPos = RegInfo.createGenericVirtualRegister(I32);
  Register PosInCluster = RegInfo.createGenericVirtualRegister(I32);

  if (!loadInputValue(ClusterPos, B, WorkGroupIdPV))
    return false;
  if (!loadInputValue(PosInCluster, B, ClusterWorkGroupIdPV))
    return false;
  if (!loadInputValue(ClusterMaxPos, B, ClusterMaxIdPV))
    return false;

  // GridPos = ClusterPos * (ClusterMaxPos + 1) + PosInCluster
  auto One = B.buildConstant(I32, 1);
  auto ClusterExtent = B.buildAdd(I32, ClusterMaxPos, One);
  auto ClusterBase = B.buildMul(I32, ClusterPos, ClusterExtent);
  auto GridPos = B.buildAdd(I32, PosInCluster, ClusterBase);

  using ClusterKind = AMDGPU::ClusterDimsAttr::Kind;
  switch (
      B.getMF().getInfo<SIMachineFunctionInfo>()->getClusterDims().getKind()) {
  case ClusterKind::NoCluster:
    // Clusters are off, so the preloaded value is the workgroup ID itself.
    B.buildCopy(Result, ClusterPos);
    return Done();
  case ClusterKind::FixedDims:
  case ClusterKind::VariableDims:
    // Clusters are on, so the grid-wide position is the answer.
    B.buildCopy(Result, GridPos);
    return Done();
  case ClusterKind::Unknown: {
    // Not known at compile time: read a 4-bit field at bit offset 6 of the
    // IB_STS2 hardware register and pick the plain preloaded value when that
    // field is zero, the grid-wide position otherwise.
    // NOTE(review): presumably this field is nonzero exactly when the
    // dispatch uses clusters -- confirm against the hardware documentation.
    unsigned StatusField = AMDGPU::Hwreg::HwregEncoding::encode(
        AMDGPU::Hwreg::ID_IB_STS2, 6, 4);
    Register StatusVal = RegInfo.createGenericVirtualRegister(I32);
    RegInfo.setRegClass(StatusVal, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_GETREG_B32_const)
        .addDef(StatusVal)
        .addImm(StatusField);
    auto Zero = B.buildConstant(I32, 0);
    auto IsZero =
        B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), StatusVal, Zero);
    B.buildSelect(Result, IsZero, ClusterPos, GridPos);
    return Done();
  }
  }

  llvm_unreachable("nothing should reach here");
}

bool AMDGPULegalizerInfo::loadInputValue(
Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
Expand All @@ -4471,8 +4539,31 @@ bool AMDGPULegalizerInfo::loadInputValue(
AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
const ArgDescriptor ClusterWorkGroupIDX =
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
const ArgDescriptor ClusterWorkGroupIDY =
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
const ArgDescriptor ClusterWorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
const ArgDescriptor ClusterWorkGroupMaxIDX =
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
const ArgDescriptor ClusterWorkGroupMaxIDY =
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
const ArgDescriptor ClusterWorkGroupMaxIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
const ArgDescriptor ClusterWorkGroupMaxFlatID =
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);

auto LoadConstant = [&](unsigned N) {
B.buildConstant(DstReg, N);
return true;
};

if (ST.hasArchitectedSGPRs() &&
(AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
bool HasFixedDims = ClusterDims.isFixedDims();

switch (ArgType) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Arg = &WorkGroupIDX;
Expand All @@ -4489,6 +4580,53 @@ bool AMDGPULegalizerInfo::loadInputValue(
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
if (HasFixedDims && ClusterDims.getDims()[0] == 1)
return LoadConstant(0);
Arg = &ClusterWorkGroupIDX;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
if (HasFixedDims && ClusterDims.getDims()[1] == 1)
return LoadConstant(0);
Arg = &ClusterWorkGroupIDY;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
if (HasFixedDims && ClusterDims.getDims()[2] == 1)
return LoadConstant(0);
Arg = &ClusterWorkGroupIDZ;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
if (HasFixedDims)
return LoadConstant(ClusterDims.getDims()[0] - 1);
Arg = &ClusterWorkGroupMaxIDX;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
if (HasFixedDims)
return LoadConstant(ClusterDims.getDims()[1] - 1);
Arg = &ClusterWorkGroupMaxIDY;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
if (HasFixedDims)
return LoadConstant(ClusterDims.getDims()[2] - 1);
Arg = &ClusterWorkGroupMaxIDZ;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
Arg = &ClusterWorkGroupMaxFlatID;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
default:
break;
}
Expand All @@ -4499,10 +4637,9 @@ bool AMDGPULegalizerInfo::loadInputValue(

if (!Arg) {
if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
// The intrinsic may appear when we have a 0 sized kernarg segment, in which
// case the pointer argument may be missing and we use null.
B.buildConstant(DstReg, 0);
return true;
// The intrinsic may appear when we have a 0 sized kernarg segment, in
// which case the pointer argument may be missing and we use null.
return LoadConstant(0);
}

// It's undefined behavior if a function marked with the amdgpu-no-*
Expand Down Expand Up @@ -7415,6 +7552,22 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
return true;
}

// Legalizes MI into a read of a constant hardware-register bit field: emits
// S_GETREG_B32_const defining MI's operand 0, with the immediate produced by
// HwregEncoding::encode(HwReg, LowBit, Width), then erases MI. Always returns
// true.
bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
MachineIRBuilder &B,
AMDGPU::Hwreg::Id HwReg,
unsigned LowBit,
unsigned Width) const {
MachineRegisterInfo &MRI = *B.getMRI();
Register DstReg = MI.getOperand(0).getReg();
// Assign SReg_32 only when the destination has no register class yet; an
// already-assigned class is left untouched.
// NOTE(review): the review thread embedded in this block questions whether a
// previously set class is necessarily the right one and suggests using
// constrainSelectedInstRegOperands or constrainGenericRegister instead.
if (!MRI.getRegClassOrNull(DstReg))
MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
Comment on lines +7562 to +7563
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this directly to selection then? Also use the normal constraint function. Having a class set previously doesn't necessarily mean it was the right one?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this directly to selection then?

Which part?

Also use the normal constraint function.

What is the normal constraint function?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

constrainSelectedInstRegOperands, or constrainGenericRegister if you really want to micro-optimize and handle the one operand

B.buildInstr(AMDGPU::S_GETREG_B32_const)
.addDef(DstReg)
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
MI.eraseFromParent();
return true;
}

// Encoded hardware-register field descriptor for the MODE register: a 23-bit
// wide field starting at bit 0.
static constexpr unsigned FPEnvModeBitField =
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

Expand Down Expand Up @@ -7577,14 +7730,64 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
case Intrinsic::amdgcn_workgroup_id_x:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
return legalizeWorkGroupId(
MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
case Intrinsic::amdgcn_workgroup_id_y:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
return legalizeWorkGroupId(
MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
case Intrinsic::amdgcn_workgroup_id_z:
return legalizePreloadedArgIntrin(MI, MRI, B,
return legalizeWorkGroupId(
MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
case Intrinsic::amdgcn_cluster_id_x:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
case Intrinsic::amdgcn_cluster_id_y:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
case Intrinsic::amdgcn_cluster_id_z:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
case Intrinsic::amdgcn_cluster_workgroup_id_x:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
case Intrinsic::amdgcn_cluster_workgroup_id_y:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
case Intrinsic::amdgcn_cluster_workgroup_id_z:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
case Intrinsic::amdgcn_cluster_workgroup_flat_id:
return ST.hasClusters() &&
legalizeConstHwRegRead(MI, B, AMDGPU::Hwreg::ID_IB_STS2, 21, 4);
case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
return ST.hasClusters() &&
legalizePreloadedArgIntrin(
MI, MRI, B,
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
case Intrinsic::amdgcn_wave_id:
return legalizeWaveID(MI, B);
case Intrinsic::amdgcn_lds_kernel_id:
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg,
const TargetRegisterClass *ArgRC, LLT ArgTy) const;
/// Legalize a workgroup-ID intrinsic for one grid dimension.
///
/// \p WorkGroupIdPV names the preloaded workgroup ID; on subtargets with
/// clusters enabled that preloaded value is the cluster ID, and the result is
/// computed as ClusterId * (ClusterMaxId + 1) + ClusterWorkGroupId using
/// \p ClusterMaxIdPV and \p ClusterWorkGroupIdPV.
///
/// \returns false if a required input value could not be loaded.
bool legalizeWorkGroupId(
    MachineInstr &MI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;

Expand Down Expand Up @@ -218,6 +223,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {

bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;
/// Replace \p MI with an S_GETREG_B32_const read of the \p Width-bit field
/// starting at bit \p LowBit of hardware register \p HwReg.
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B,
AMDGPU::Hwreg::Id HwReg, unsigned LowBit,
unsigned Width) const;

bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
Expand Down
Loading
Loading