Skip to content

Commit 2195fe7

Browse files
shiltiankrzysz00
andauthored
[AMDGPU] Add the support for 45-bit buffer resource (#159702)
On new targets like `gfx1250`, the buffer resource (V#) now uses this format: ``` base (57-bit): resource[56:0] num_records (45-bit): resource[101:57] reserved (6-bit): resource[107:102] stride (14-bit): resource[121:108] ``` This PR changes the type of `num_records` from `i32` to `i64` in both builtin and intrinsic, and also adds the support for lowering the new format. Fixes SWDEV-554034. --------- Co-authored-by: Krzysztof Drewniak <[email protected]>
1 parent 92e5060 commit 2195fe7

File tree

27 files changed

+942
-242
lines changed

27 files changed

+942
-242
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
163163
BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
164164
BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc")
165165

166-
BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc")
166+
BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sWii", "nc")
167167
BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vUcQbiiIi", "n")
168168
BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vUsQbiiIi", "n")
169169
BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "vUiQbiiIi", "n")

clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
2525
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
2626
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
27+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
2728
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
28-
// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]])
29+
// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 [[TMP3]])
2930
// CHECK-NEXT: ret ptr addrspace(8) [[TMP4]]
3031
//
3132
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -48,8 +49,9 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short
4849
// CHECK-NEXT: store i32 [[FLAGS]], ptr [[FLAGS_ADDR_ASCAST]], align 4
4950
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
5051
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
52+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
5153
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
52-
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]])
54+
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i64 [[CONV]], i32 [[TMP2]])
5355
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
5456
//
5557
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -73,7 +75,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constan
7375
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
7476
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
7577
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
76-
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]])
78+
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 1234, i32 [[TMP2]])
7779
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
7880
//
7981
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -97,7 +99,8 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(v
9799
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
98100
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
99101
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
100-
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678)
102+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64
103+
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i64 [[CONV]], i32 5678)
101104
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
102105
//
103106
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {

clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
66
// CHECK-NEXT: entry:
7-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
7+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
8+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
89
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
910
//
1011
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -13,7 +14,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, in
1314

1415
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_stride_constant(
1516
// CHECK-NEXT: entry:
16-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
17+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
18+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
1719
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
1820
//
1921
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -22,7 +24,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p,
2224

2325
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_num_constant(
2426
// CHECK-NEXT: entry:
25-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
27+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
2628
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
2729
//
2830
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -31,7 +33,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, sho
3133

3234
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_flags_constant(
3335
// CHECK-NEXT: entry:
34-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
36+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
37+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
3538
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
3639
//
3740
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
@@ -40,7 +43,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, s
4043

4144
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1(
4245
// CHECK-NEXT: entry:
43-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
46+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
47+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
4448
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
4549
//
4650
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short stride, int num, int flags) {
@@ -49,7 +53,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short str
4953

5054
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_stride_constant(
5155
// CHECK-NEXT: entry:
52-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
56+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
57+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i64 [[CONV]], i32 [[FLAGS:%.*]])
5358
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
5459
//
5560
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global void *p, int num, int flags) {
@@ -58,7 +63,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global vo
5863

5964
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_num_constant(
6065
// CHECK-NEXT: entry:
61-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
66+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 1234, i32 [[FLAGS:%.*]])
6267
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
6368
//
6469
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void *p, short stride, int flags) {
@@ -67,7 +72,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void
6772

6873
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_flags_constant(
6974
// CHECK-NEXT: entry:
70-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
75+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
76+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i64 [[CONV]], i32 5678)
7177
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
7278
//
7379
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global void *p, short stride, int num) {
@@ -76,7 +82,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global voi
7682

7783
// CHECK-LABEL: @test_amdgcn_make_buffer_p0_nullptr(
7884
// CHECK-NEXT: entry:
79-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
85+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
86+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
8087
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
8188
//
8289
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, int flags) {
@@ -85,7 +92,8 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num,
8592

8693
// CHECK-LABEL: @test_amdgcn_make_buffer_p1_nullptr(
8794
// CHECK-NEXT: entry:
88-
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
95+
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[NUM:%.*]] to i64
96+
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i64 [[CONV]], i32 [[FLAGS:%.*]])
8997
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
9098
//
9199
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p1_nullptr(short stride, int num, int flags) {

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1431,7 +1431,7 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
14311431
[llvm_anyptr_ty],
14321432
[llvm_anyptr_ty, // base
14331433
llvm_i16_ty, // stride (and swizzle control)
1434-
llvm_i32_ty, // NumRecords / extent
1434+
llvm_i64_ty, // NumRecords / extent
14351435
llvm_i32_ty], // flags
14361436
// Attributes lifted from ptrmask + some extra argument attributes.
14371437
[IntrNoMem, ReadNone<ArgIndex<0>>,

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,6 +1443,12 @@ def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic
14431443
"Has LDS barrier-arrive atomic instructions"
14441444
>;
14451445

1446+
def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource",
1447+
"Has45BitNumRecordsBufferResource",
1448+
"true",
1449+
"The buffer resource (V#) supports 45-bit num_records"
1450+
>;
1451+
14461452
// Dummy feature used to disable assembler instructions.
14471453
def FeatureDisable : SubtargetFeature<"",
14481454
"FeatureDisable","true",
@@ -2106,6 +2112,7 @@ def FeatureISAVersion12_50 : FeatureSet<
21062112
FeatureMadU32Inst,
21072113
FeatureLdsBarrierArriveAtomic,
21082114
FeatureSetPrioIncWgInst,
2115+
Feature45BitNumRecordsBufferResource,
21092116
]>;
21102117

21112118
def FeatureISAVersion12_51 : FeatureSet<

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5905,33 +5905,50 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
59055905
Register Flags = MI.getOperand(5).getReg();
59065906

59075907
LLT S32 = LLT::scalar(32);
5908+
LLT S64 = LLT::scalar(64);
59085909

59095910
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5910-
auto Unmerge = B.buildUnmerge(S32, Pointer);
5911-
Register LowHalf = Unmerge.getReg(0);
5912-
Register HighHalf = Unmerge.getReg(1);
5913-
5914-
auto AndMask = B.buildConstant(S32, 0x0000ffff);
5915-
auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5916-
5917-
MachineInstrBuilder NewHighHalf = Masked;
5918-
std::optional<ValueAndVReg> StrideConst =
5919-
getIConstantVRegValWithLookThrough(Stride, MRI);
5920-
if (!StrideConst || !StrideConst->Value.isZero()) {
5921-
MachineInstrBuilder ShiftedStride;
5922-
if (StrideConst) {
5923-
uint32_t StrideVal = StrideConst->Value.getZExtValue();
5924-
uint32_t ShiftedStrideVal = StrideVal << 16;
5925-
ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5926-
} else {
5927-
auto ExtStride = B.buildAnyExt(S32, Stride);
5928-
auto ShiftConst = B.buildConstant(S32, 16);
5929-
ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5930-
}
5931-
NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5911+
5912+
auto ExtStride = B.buildAnyExt(S32, Stride);
5913+
5914+
if (ST.has45BitNumRecordsBufferResource()) {
5915+
Register Zero = B.buildConstant(S32, 0).getReg(0);
5916+
// Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
5917+
// num_records.
5918+
LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
5919+
auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
5920+
auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
5921+
auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
5922+
Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
5923+
5924+
// Build the higher 64-bit value, which has the higher 38-bit num_records,
5925+
// 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
5926+
auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
5927+
auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
5928+
auto ExtShiftedStride =
5929+
B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
5930+
auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
5931+
auto ExtShiftedFlags =
5932+
B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
5933+
auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
5934+
Register HighHalf =
5935+
B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
5936+
B.buildMergeValues(Result, {LowHalf, HighHalf});
5937+
} else {
5938+
NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
5939+
auto Unmerge = B.buildUnmerge(S32, Pointer);
5940+
auto LowHalf = Unmerge.getReg(0);
5941+
auto HighHalf = Unmerge.getReg(1);
5942+
5943+
auto AndMask = B.buildConstant(S32, 0x0000ffff);
5944+
auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5945+
auto ShiftConst = B.buildConstant(S32, 16);
5946+
auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5947+
auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5948+
Register NewHighHalfReg = NewHighHalf.getReg(0);
5949+
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
59325950
}
5933-
Register NewHighHalfReg = NewHighHalf.getReg(0);
5934-
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5951+
59355952
MI.eraseFromParent();
59365953
return true;
59375954
}

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
285285
bool UseBlockVGPROpsForCSR = false;
286286
bool HasGloballyAddressableScratch = false;
287287

288+
bool Has45BitNumRecordsBufferResource = false;
289+
288290
// Dummy feature to use for assembler in tablegen.
289291
bool FeatureDisable = false;
290292

@@ -1849,6 +1851,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18491851
return 4;
18501852
return 3;
18511853
}
1854+
1855+
/// \returns true if the sub-target supports buffer resource (V#) with 45-bit
1856+
/// num_records.
1857+
bool has45BitNumRecordsBufferResource() const {
1858+
return Has45BitNumRecordsBufferResource;
1859+
}
18521860
};
18531861

18541862
class GCNUserSGPRUsageInfo {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 53 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11586,29 +11586,61 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
1158611586
SDValue NumRecords = Op->getOperand(3);
1158711587
SDValue Flags = Op->getOperand(4);
1158811588

11589-
auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11590-
SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11591-
SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11592-
std::optional<uint32_t> ConstStride = std::nullopt;
11593-
if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11594-
ConstStride = ConstNode->getZExtValue();
11595-
11596-
SDValue NewHighHalf = Masked;
11597-
if (!ConstStride || *ConstStride != 0) {
11598-
SDValue ShiftedStride;
11599-
if (ConstStride) {
11600-
ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11601-
} else {
11602-
SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11603-
ShiftedStride =
11604-
DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11605-
DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11606-
}
11607-
NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11589+
SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11590+
SDValue Rsrc;
11591+
11592+
if (Subtarget->has45BitNumRecordsBufferResource()) {
11593+
SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11594+
// Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
11595+
// num_records.
11596+
SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11597+
SDValue NumRecordsLHS =
11598+
DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11599+
DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11600+
SDValue LowHalf =
11601+
DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11602+
11603+
// Build the higher 64-bit value, which has the higher 38-bit num_records,
11604+
// 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
11605+
SDValue NumRecordsRHS =
11606+
DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11607+
DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11608+
SDValue ShiftedStride =
11609+
DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11610+
DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11611+
SDValue ExtShiftedStrideVec =
11612+
DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11613+
SDValue ExtShiftedStride =
11614+
DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11615+
SDValue ShiftedFlags =
11616+
DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11617+
DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11618+
SDValue ExtShiftedFlagsVec =
11619+
DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11620+
SDValue ExtShiftedFlags =
11621+
DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11622+
SDValue CombinedFields =
11623+
DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11624+
SDValue HighHalf =
11625+
DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11626+
11627+
Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11628+
} else {
11629+
NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11630+
auto [LowHalf, HighHalf] =
11631+
DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11632+
SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11633+
SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11634+
SDValue ShiftedStride =
11635+
DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11636+
DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11637+
SDValue NewHighHalf =
11638+
DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11639+
11640+
Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11641+
NumRecords, Flags);
1160811642
}
1160911643

11610-
SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11611-
NewHighHalf, NumRecords, Flags);
1161211644
SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
1161311645
return RsrcPtr;
1161411646
}

0 commit comments

Comments
 (0)