Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc")

BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc")
BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sWii", "nc")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vUcQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vUsQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "vUiQbiiIi", "n")
Expand Down
11 changes: 7 additions & 4 deletions clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]])
// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 [[TMP3]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP4]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
Expand All @@ -48,8 +49,9 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short
// CHECK-NEXT: store i32 [[FLAGS]], ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i64 [[CONV]], i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
Expand All @@ -73,7 +75,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constan
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 1234, i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
Expand All @@ -97,7 +99,8 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(v
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678)
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
Expand Down
28 changes: 18 additions & 10 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
Expand All @@ -13,7 +14,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, in

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_stride_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
Expand All @@ -22,7 +24,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p,

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_num_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
Expand All @@ -31,7 +33,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, sho

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_flags_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
Expand All @@ -40,7 +43,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, s

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short stride, int num, int flags) {
Expand All @@ -49,7 +53,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short str

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_stride_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global void *p, int num, int flags) {
Expand All @@ -58,7 +63,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global vo

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_num_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void *p, short stride, int flags) {
Expand All @@ -67,7 +72,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_flags_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global void *p, short stride, int num) {
Expand All @@ -76,7 +82,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global voi

// CHECK-LABEL: @test_amdgcn_make_buffer_p0_nullptr(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, int flags) {
Expand All @@ -85,7 +92,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num,

// CHECK-LABEL: @test_amdgcn_make_buffer_p1_nullptr(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p1_nullptr(short stride, int num, int flags) {
Expand Down
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1431,7 +1431,7 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
[llvm_anyptr_ty],
[llvm_anyptr_ty, // base
llvm_i16_ty, // stride (and swizzle control)
llvm_i32_ty, // NumRecords / extent
llvm_i64_ty, // NumRecords / extent
llvm_i32_ty], // flags
// Attributes lifted from ptrmask + some extra argument attributes.
[IntrNoMem, ReadNone<ArgIndex<0>>,
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1443,6 +1443,12 @@ def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic
"Has LDS barrier-arrive atomic instructions"
>;

def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource",
"Has45BitNumRecordsBufferResource",
"true",
"The buffer resource (V#) supports 45-bit num_records"
>;

// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
Expand Down Expand Up @@ -2106,6 +2112,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureMadU32Inst,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
Feature45BitNumRecordsBufferResource,
]>;

def FeatureISAVersion12_51 : FeatureSet<
Expand Down
65 changes: 41 additions & 24 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5905,33 +5905,50 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
Register Flags = MI.getOperand(5).getReg();

LLT S32 = LLT::scalar(32);
LLT S64 = LLT::scalar(64);

B.setInsertPt(B.getMBB(), ++B.getInsertPt());
auto Unmerge = B.buildUnmerge(S32, Pointer);
Register LowHalf = Unmerge.getReg(0);
Register HighHalf = Unmerge.getReg(1);

auto AndMask = B.buildConstant(S32, 0x0000ffff);
auto Masked = B.buildAnd(S32, HighHalf, AndMask);

MachineInstrBuilder NewHighHalf = Masked;
std::optional<ValueAndVReg> StrideConst =
getIConstantVRegValWithLookThrough(Stride, MRI);
if (!StrideConst || !StrideConst->Value.isZero()) {
MachineInstrBuilder ShiftedStride;
if (StrideConst) {
uint32_t StrideVal = StrideConst->Value.getZExtValue();
uint32_t ShiftedStrideVal = StrideVal << 16;
ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
} else {
auto ExtStride = B.buildAnyExt(S32, Stride);
auto ShiftConst = B.buildConstant(S32, 16);
ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
}
NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);

auto ExtStride = B.buildAnyExt(S32, Stride);

if (ST.has45BitNumRecordsBufferResource()) {
Register Zero = B.buildConstant(S32, 0).getReg(0);
// Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
// num_records.
LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);

// Build the higher 64-bit value, which has the higher 38-bit num_records,
// 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
auto ExtShiftedStride =
B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
auto ExtShiftedFlags =
B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
Register HighHalf =
B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
B.buildMergeValues(Result, {LowHalf, HighHalf});
} else {
NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
auto Unmerge = B.buildUnmerge(S32, Pointer);
auto LowHalf = Unmerge.getReg(0);
auto HighHalf = Unmerge.getReg(1);

auto AndMask = B.buildConstant(S32, 0x0000ffff);
auto Masked = B.buildAnd(S32, HighHalf, AndMask);
auto ShiftConst = B.buildConstant(S32, 16);
auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
Register NewHighHalfReg = NewHighHalf.getReg(0);
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
}
Register NewHighHalfReg = NewHighHalf.getReg(0);
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});

MI.eraseFromParent();
return true;
}
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool UseBlockVGPROpsForCSR = false;
bool HasGloballyAddressableScratch = false;

bool Has45BitNumRecordsBufferResource = false;

// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;

Expand Down Expand Up @@ -1849,6 +1851,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return 4;
return 3;
}

/// \returns true if the sub-target supports buffer resource (V#) with 45-bit
/// num_records.
bool has45BitNumRecordsBufferResource() const {
return Has45BitNumRecordsBufferResource;
}
};

class GCNUserSGPRUsageInfo {
Expand Down
74 changes: 53 additions & 21 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11602,29 +11602,61 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
SDValue NumRecords = Op->getOperand(3);
SDValue Flags = Op->getOperand(4);

auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
std::optional<uint32_t> ConstStride = std::nullopt;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
ConstStride = ConstNode->getZExtValue();

SDValue NewHighHalf = Masked;
if (!ConstStride || *ConstStride != 0) {
SDValue ShiftedStride;
if (ConstStride) {
ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
} else {
SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
ShiftedStride =
DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
DAG.getShiftAmountConstant(16, MVT::i32, Loc));
}
NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
SDValue Rsrc;

if (Subtarget->has45BitNumRecordsBufferResource()) {
SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
// Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
// num_records.
SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
SDValue NumRecordsLHS =
DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
DAG.getShiftAmountConstant(57, MVT::i32, Loc));
SDValue LowHalf =
DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);

// Build the higher 64-bit value, which has the higher 38-bit num_records,
// 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
SDValue NumRecordsRHS =
DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
DAG.getShiftAmountConstant(7, MVT::i32, Loc));
SDValue ShiftedStride =
DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
DAG.getShiftAmountConstant(12, MVT::i32, Loc));
SDValue ExtShiftedStrideVec =
DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
SDValue ExtShiftedStride =
DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
SDValue ShiftedFlags =
DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
DAG.getShiftAmountConstant(28, MVT::i32, Loc));
SDValue ExtShiftedFlagsVec =
DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
SDValue ExtShiftedFlags =
DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
SDValue CombinedFields =
DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
SDValue HighHalf =
DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);

Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
} else {
NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
auto [LowHalf, HighHalf] =
DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
SDValue ShiftedStride =
DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
DAG.getShiftAmountConstant(16, MVT::i32, Loc));
SDValue NewHighHalf =
DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);

Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
NumRecords, Flags);
}

SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
NewHighHalf, NumRecords, Flags);
SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
return RsrcPtr;
}
Expand Down
Loading