Skip to content

Commit e97fb22

Browse files
arsenm and srpande authored
AMDGPU: Add support for load transpose instructions for gfx950 (#117378)
This patch adds support for the intrinsics in clang, as well as the assembly instructions in the backend. Co-authored-by: Sirish Pande <[email protected]>
1 parent d88ed93 commit e97fb22

File tree

13 files changed

+427
-3
lines changed

13 files changed

+427
-3
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,11 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiI
462462
TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap")
463463
TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap")
464464

465+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
466+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
467+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
468+
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
469+
465470
//===----------------------------------------------------------------------===//
466471
// GFX12+ only builtins.
467472
//===----------------------------------------------------------------------===//

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19697,8 +19697,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1969719697
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
1969819698
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
1969919699
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
19700-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: {
19701-
19700+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
19701+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
19702+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
19703+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
19704+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
1970219705
Intrinsic::ID IID;
1970319706
switch (BuiltinID) {
1970419707
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
@@ -19713,6 +19716,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1971319716
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
1971419717
IID = Intrinsic::amdgcn_global_load_tr_b128;
1971519718
break;
19719+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
19720+
IID = Intrinsic::amdgcn_ds_read_tr4_b64;
19721+
break;
19722+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
19723+
IID = Intrinsic::amdgcn_ds_read_tr8_b64;
19724+
break;
19725+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
19726+
IID = Intrinsic::amdgcn_ds_read_tr6_b96;
19727+
break;
19728+
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
19729+
IID = Intrinsic::amdgcn_ds_read_tr16_b64;
19730+
break;
1971619731
}
1971719732
llvm::Type *LoadTy = ConvertType(E->getType());
1971819733
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
2+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s
3+
4+
typedef int v2i __attribute__((ext_vector_type(2)));
5+
typedef int v3i __attribute__((ext_vector_type(3)));
6+
typedef short v4s __attribute__((ext_vector_type(4)));
7+
8+
// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32(
9+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
10+
// GFX950-NEXT: entry:
11+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[INPTR]])
12+
// GFX950-NEXT: ret <2 x i32> [[TMP0]]
13+
//
14+
v2i test_amdgcn_ds_read_b64_tr_b4_v2i32(local v2i* inptr)
15+
{
16+
return __builtin_amdgcn_ds_read_tr4_b64_v2i32(inptr);
17+
}
18+
19+
// GFX950-LABEL: define dso_local <3 x i32> @test_amdgcn_ds_read_b96_tr_b6_v3i32(
20+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
21+
// GFX950-NEXT: entry:
22+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) [[INPTR]])
23+
// GFX950-NEXT: ret <3 x i32> [[TMP0]]
24+
//
25+
v3i test_amdgcn_ds_read_b96_tr_b6_v3i32(local v3i* inptr)
26+
{
27+
return __builtin_amdgcn_ds_read_tr6_b96_v3i32(inptr);
28+
}
29+
30+
// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b8_v2i32(
31+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
32+
// GFX950-NEXT: entry:
33+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) [[INPTR]])
34+
// GFX950-NEXT: ret <2 x i32> [[TMP0]]
35+
//
36+
v2i test_amdgcn_ds_read_b64_tr_b8_v2i32(local v2i* inptr)
37+
{
38+
return __builtin_amdgcn_ds_read_tr8_b64_v2i32(inptr);
39+
}
40+
41+
// GFX950-LABEL: define dso_local <4 x i16> @test_amdgcn_ds_read_b64_tr_b16_v2i16(
42+
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
43+
// GFX950-NEXT: entry:
44+
// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) [[INPTR]])
45+
// GFX950-NEXT: ret <4 x i16> [[TMP0]]
46+
//
47+
v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr)
48+
{
49+
return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr);
50+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2726,6 +2726,10 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
27262726

27272727
def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
27282728
def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
2729+
def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
2730+
def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
2731+
def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
2732+
def int_amdgcn_ds_read_tr16_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
27292733

27302734
// i32 @llvm.amdgcn.wave.id()
27312735
def int_amdgcn_wave_id :

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4967,6 +4967,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49674967
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
49684968
case Intrinsic::amdgcn_global_load_tr_b64:
49694969
case Intrinsic::amdgcn_global_load_tr_b128:
4970+
case Intrinsic::amdgcn_ds_read_tr4_b64:
4971+
case Intrinsic::amdgcn_ds_read_tr6_b96:
4972+
case Intrinsic::amdgcn_ds_read_tr8_b64:
4973+
case Intrinsic::amdgcn_ds_read_tr16_b64:
49704974
return getDefaultMappingAllVGPR(MI);
49714975
case Intrinsic::amdgcn_ds_ordered_add:
49724976
case Intrinsic::amdgcn_ds_ordered_swap: {

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,11 @@ def : SourceOfDivergence<intr>;
345345
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
346346
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
347347

348+
def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>;
349+
def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>;
350+
def : SourceOfDivergence<int_amdgcn_ds_read_tr8_b64>;
351+
def : SourceOfDivergence<int_amdgcn_ds_read_tr16_b64>;
352+
348353
// The dummy boolean output is divergent from the IR's perspective,
349354
// but the mask results are uniform. These produce a divergent and
350355
// uniform result, so the returned struct is collectively divergent.

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,12 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
294294
}
295295
}
296296

297+
// Wraps DS_1A_RET in a multiclass that emits a single def (empty name suffix,
// so the record is named exactly after the defm) with has_m0_read set to 0.
// opName is the assembly mnemonic and rc the register class forwarded to
// DS_1A_RET; rc defaults to VGPR_32.
multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
298+
let has_m0_read = 0 in {
299+
def "" : DS_1A_RET<opName, rc>;
300+
}
301+
}
302+
297303
class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
298304
DS_1A_RET<opName, rc, 1>;
299305

@@ -744,6 +750,13 @@ multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
744750
defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
745751
} // let SubtargetPredicate = isGFX12Plus
746752

753+
let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
754+
defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
755+
defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>;
756+
defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", VReg_64>;
757+
defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", VReg_96>;
758+
}
759+
747760
//===----------------------------------------------------------------------===//
748761
// DS Patterns
749762
//===----------------------------------------------------------------------===//
@@ -1179,6 +1192,18 @@ def : GCNPat <
11791192
sub0)
11801193
>;
11811194

1195+
// Selection pattern for the DS transpose loads: a `node` producing a value of
// type `vt` whose address operand matches DS1Addr1Offset is selected to `inst`,
// fed with the matched $ptr and $offset. The trailing operand is the constant
// (i1 0) -- presumably the instruction's gds bit; confirm against the operand
// list of DS_1A_RET.
class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPat <
1196+
(vt (node (DS1Addr1Offset i32:$ptr, i32:$offset))),
1197+
(inst $ptr, Offset:$offset, (i1 0))
1198+
>;
1199+
1200+
let SubtargetPredicate = HasGFX950Insts in {
1201+
def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>;
1202+
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
1203+
def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>;
1204+
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>;
1205+
}
1206+
11821207
//===----------------------------------------------------------------------===//
11831208
// Target-specific instruction encodings.
11841209
//===----------------------------------------------------------------------===//
@@ -1748,3 +1773,11 @@ def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
17481773
def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
17491774
def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
17501775
def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
1776+
1777+
//===----------------------------------------------------------------------===//
1778+
// GFX950.
1779+
//===----------------------------------------------------------------------===//
1780+
def DS_READ_B64_TR_B4_vi : DS_Real_vi<0x0e0, DS_READ_B64_TR_B4>;
1781+
def DS_READ_B96_TR_B6_vi : DS_Real_vi<0x0e1, DS_READ_B96_TR_B6>;
1782+
def DS_READ_B64_TR_B8_vi : DS_Real_vi<0x0e2, DS_READ_B64_TR_B8>;
1783+
def DS_READ_B64_TR_B16_vi : DS_Real_vi<0x0e3, DS_READ_B64_TR_B16>;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1382,7 +1382,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13821382
return true;
13831383
}
13841384
case Intrinsic::amdgcn_global_load_tr_b64:
1385-
case Intrinsic::amdgcn_global_load_tr_b128: {
1385+
case Intrinsic::amdgcn_global_load_tr_b128:
1386+
case Intrinsic::amdgcn_ds_read_tr4_b64:
1387+
case Intrinsic::amdgcn_ds_read_tr6_b96:
1388+
case Intrinsic::amdgcn_ds_read_tr8_b64:
1389+
case Intrinsic::amdgcn_ds_read_tr16_b64: {
13861390
Info.opc = ISD::INTRINSIC_W_CHAIN;
13871391
Info.memVT = MVT::getVT(CI.getType());
13881392
Info.ptrVal = CI.getOperand(0);
@@ -1477,6 +1481,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
14771481
case Intrinsic::amdgcn_atomic_cond_sub_u32:
14781482
case Intrinsic::amdgcn_ds_append:
14791483
case Intrinsic::amdgcn_ds_consume:
1484+
case Intrinsic::amdgcn_ds_read_tr4_b64:
1485+
case Intrinsic::amdgcn_ds_read_tr6_b96:
1486+
case Intrinsic::amdgcn_ds_read_tr8_b64:
1487+
case Intrinsic::amdgcn_ds_read_tr16_b64:
14801488
case Intrinsic::amdgcn_ds_ordered_add:
14811489
case Intrinsic::amdgcn_ds_ordered_swap:
14821490
case Intrinsic::amdgcn_flat_atomic_fmax_num:

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,50 @@ bb:
261261
ret void
262262
}
263263

264+
declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3))
265+
266+
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
267+
define amdgpu_kernel void @ds_read_b64_tr4_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
268+
bb:
269+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
270+
%tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
271+
store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
272+
ret void
273+
}
274+
275+
declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3))
276+
277+
; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
278+
define amdgpu_kernel void @ds_read_b96_tr6_v3i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
279+
bb:
280+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
281+
%tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
282+
store <3 x i32> %tmp0, ptr addrspace(1) %out, align 16
283+
ret void
284+
}
285+
286+
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3))
287+
288+
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
289+
define amdgpu_kernel void @ds_read_b64_tr8_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
290+
bb:
291+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
292+
%tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
293+
store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
294+
ret void
295+
}
296+
297+
declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3))
298+
299+
; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
300+
define amdgpu_kernel void @ds_read_b64_tr_b16_v4i16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
301+
bb:
302+
%gep = getelementptr i64, ptr addrspace(3) %addr, i16 4
303+
%tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
304+
store <4 x i16> %tmp0, ptr addrspace(1) %out, align 16
305+
ret void
306+
}
307+
264308
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
265309
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
266310

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s
3+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s
4+
5+
declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
6+
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
7+
; The 6-bit transpose load is the 96-bit variant (tr6.b96); this is the name the test below calls.
declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3))
8+
declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3))
9+
10+
define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
11+
; GFX950-SDAG-LABEL: ds_read_b64_tr_b4:
12+
; GFX950-SDAG: ; %bb.0: ; %entry
13+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
14+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
15+
; GFX950-SDAG-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
16+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
17+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
18+
; GFX950-SDAG-NEXT: s_endpgm
19+
;
20+
; GFX950-GISEL-LABEL: ds_read_b64_tr_b4:
21+
; GFX950-GISEL: ; %bb.0: ; %entry
22+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
23+
; GFX950-GISEL-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
24+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
25+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
26+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
27+
; GFX950-GISEL-NEXT: s_endpgm
28+
entry:
29+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
30+
%val = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
31+
store <2 x i32> %val, ptr addrspace(1) %use
32+
ret void
33+
}
34+
35+
define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
36+
; GFX950-SDAG-LABEL: ds_read_b96_tr_b6:
37+
; GFX950-SDAG: ; %bb.0: ; %entry
38+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2
39+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1
40+
; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
41+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
42+
; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
43+
; GFX950-SDAG-NEXT: s_endpgm
44+
;
45+
; GFX950-GISEL-LABEL: ds_read_b96_tr_b6:
46+
; GFX950-GISEL: ; %bb.0: ; %entry
47+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
48+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
49+
; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
50+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
51+
; GFX950-GISEL-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
52+
; GFX950-GISEL-NEXT: s_endpgm
53+
entry:
54+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
55+
%val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
56+
store <3 x i32> %val, ptr addrspace(1) %use
57+
ret void
58+
}
59+
60+
define amdgpu_ps void @ds_read_b64_tr_b8(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
61+
; GFX950-SDAG-LABEL: ds_read_b64_tr_b8:
62+
; GFX950-SDAG: ; %bb.0: ; %entry
63+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
64+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
65+
; GFX950-SDAG-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
66+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
67+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
68+
; GFX950-SDAG-NEXT: s_endpgm
69+
;
70+
; GFX950-GISEL-LABEL: ds_read_b64_tr_b8:
71+
; GFX950-GISEL: ; %bb.0: ; %entry
72+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
73+
; GFX950-GISEL-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
74+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
75+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
76+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
77+
; GFX950-GISEL-NEXT: s_endpgm
78+
entry:
79+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
80+
%val = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
81+
store <2 x i32> %val, ptr addrspace(1) %use
82+
ret void
83+
}
84+
85+
define amdgpu_ps void @ds_read_b64_tr_b16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
86+
; GFX950-SDAG-LABEL: ds_read_b64_tr_b16:
87+
; GFX950-SDAG: ; %bb.0: ; %entry
88+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
89+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
90+
; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
91+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
92+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
93+
; GFX950-SDAG-NEXT: s_endpgm
94+
;
95+
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16:
96+
; GFX950-GISEL: ; %bb.0: ; %entry
97+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
98+
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
99+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
100+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
101+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
102+
; GFX950-GISEL-NEXT: s_endpgm
103+
entry:
104+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
105+
%val = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3) %gep)
106+
store <4 x i16> %val, ptr addrspace(1) %use
107+
ret void
108+
}

0 commit comments

Comments
 (0)