Skip to content

Commit f0c1f2b

Browse files
authored
SWDEV-527443 - Port NV4x Intrinsic changes for ROCm 7.0 (llvm#2801)
1 parent 7bd7e24 commit f0c1f2b

28 files changed

+825
-73
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,6 @@ TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV4fV4hV4hV4Ui
300300
TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_l, "V4UiWUifV4fV4fV4fV4Ui", "nc", "gfx10-insts")
301301
TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4hV4hV4Ui", "nc", "gfx10-insts")
302302

303-
304303
//===----------------------------------------------------------------------===//
305304
// GFX11+ only builtins.
306305
//===----------------------------------------------------------------------===//
@@ -309,6 +308,8 @@ TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4hV4hV4
309308
TARGET_BUILTIN(__builtin_amdgcn_permlane64, "UiUi", "nc", "gfx11-insts")
310309
TARGET_BUILTIN(__builtin_amdgcn_s_wait_event_export_ready, "v", "n", "gfx11-insts")
311310

311+
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn, "V2UiUiUiV4UiIi", "n", "gfx11-insts")
312+
312313
//===----------------------------------------------------------------------===//
313314
// WMMA builtins.
314315
// Postfix w32 indicates the builtin requires wavefront size of 32.
@@ -503,6 +504,16 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_named_barrier_state, "Uiv*", "n", "gfx12-i
503504
TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts")
504505
TARGET_BUILTIN(__builtin_amdgcn_s_buffer_prefetch_data, "vQbIiUi", "nc", "gfx12-insts")
505506

507+
// For the following two image_bvh*_intersect_ray builtins, the second and
508+
// third return values of the intrinsics are returned through the last two pointer-type function arguments.
509+
TARGET_BUILTIN(__builtin_amdgcn_image_bvh8_intersect_ray, "V10UiWUifUcV3fV3fUiV4UiV3f*V3f*", "nc", "gfx12-insts")
510+
TARGET_BUILTIN(__builtin_amdgcn_image_bvh_dual_intersect_ray, "V10UiWUifUcV3fV3fV2UiV4UiV3f*V3f*", "nc", "gfx12-insts")
511+
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn, "V2UiUiUiV8UiIi", "n", "gfx12-insts")
512+
513+
// The intrinsic returns {i64, i32}, but the builtin returns <2 x i64>, so
514+
// the second return value of the intrinsic is zero-extended to i64.
515+
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi", "n", "gfx12-insts")
516+
506517
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
507518
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
508519
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20175,19 +20175,81 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
2017520175
return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
2017620176
RayInverseDir, TextureDescr});
2017720177
}
20178+
case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
20179+
case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
20180+
Intrinsic::ID IID;
20181+
switch (BuiltinID) {
20182+
case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
20183+
IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
20184+
break;
20185+
case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
20186+
IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
20187+
break;
20188+
}
20189+
llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
20190+
llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
20191+
llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
20192+
llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
20193+
llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
20194+
llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
20195+
llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));
20196+
20197+
Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
20198+
Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));
20199+
20200+
llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
20201+
20202+
llvm::CallInst *CI = Builder.CreateCall(
20203+
IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
20204+
Offset, TextureDescr});
20205+
20206+
llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
20207+
llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
20208+
llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);
20209+
20210+
Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
20211+
Builder.CreateStore(RetRayDir, RetRayDirPtr);
20212+
20213+
return RetVData;
20214+
}
20215+
20216+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
20217+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
20218+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
20219+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
20220+
Intrinsic::ID IID;
20221+
switch (BuiltinID) {
20222+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
20223+
IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
20224+
break;
20225+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
20226+
IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
20227+
break;
20228+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
20229+
IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
20230+
break;
20231+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
20232+
IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
20233+
break;
20234+
}
2017820235

20179-
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: {
2018020236
SmallVector<Value *, 4> Args;
2018120237
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
2018220238
Args.push_back(EmitScalarExpr(E->getArg(i)));
2018320239

20184-
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn);
20240+
Function *F = CGM.getIntrinsic(IID);
2018520241
Value *Call = Builder.CreateCall(F, Args);
2018620242
Value *Rtn = Builder.CreateExtractValue(Call, 0);
2018720243
Value *A = Builder.CreateExtractValue(Call, 1);
2018820244
llvm::Type *RetTy = ConvertType(E->getType());
2018920245
Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
2019020246
(uint64_t)0);
20247+
// ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
20248+
// <2 x i64>, zext the second value.
20249+
if (A->getType()->getPrimitiveSizeInBits() <
20250+
RetTy->getScalarType()->getPrimitiveSizeInBits())
20251+
A = Builder.CreateZExt(A, RetTy->getScalarType());
20252+
2019120253
return Builder.CreateInsertElement(I0, A, 1);
2019220254
}
2019320255
case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:

clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
// RUN: -emit-llvm -cl-std=CL2.0 -o - %s | FileCheck %s
44
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S \
55
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=ISA %s
6+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm \
7+
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12 %s
8+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S \
9+
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12ISA %s
610

711
// Test llvm.amdgcn.image.bvh.intersect.ray intrinsic.
812

@@ -12,12 +16,18 @@
1216
// Postfix l indicates the 1st argument is i64 and postfix h indicates
1317
// the 4th and 5th arguments are half4.
1418

19+
typedef unsigned char uchar;
1520
typedef unsigned int uint;
1621
typedef unsigned long ulong;
22+
typedef float float3 __attribute__((ext_vector_type(3)));
1723
typedef float float4 __attribute__((ext_vector_type(4)));
1824
typedef double double4 __attribute__((ext_vector_type(4)));
1925
typedef half half4 __attribute__((ext_vector_type(4)));
26+
typedef uint uint2 __attribute__((ext_vector_type(2)));
2027
typedef uint uint4 __attribute__((ext_vector_type(4)));
28+
typedef uint uint8 __attribute__((ext_vector_type(8)));
29+
typedef uint uint10 __attribute__((ext_vector_type(10)));
30+
typedef ulong ulong2 __attribute__((ext_vector_type(2)));
2131

2232
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32
2333
// ISA: image_bvh_intersect_ray
@@ -59,3 +69,71 @@ void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr,
5969
ray_origin, ray_dir, ray_inv_dir, texture_descr);
6070
}
6171

72+
#if __has_builtin(__builtin_amdgcn_image_bvh8_intersect_ray)
73+
// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh8.intersect.ray(
74+
// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
75+
// GFX12: <3 x float> %ray_dir, i32 %offset, <4 x i32> %texture_descr)
76+
// GFX12ISA: image_bvh8_intersect_ray
77+
// Exercises __builtin_amdgcn_image_bvh8_intersect_ray: the builtin's return
// value is stored to *ret_vdata, and ret_ray_origin / ret_ray_dir are passed
// as its last two (pointer) arguments. Here `offset` is a scalar uint.
void test_image_bvh8_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
78+
float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
79+
float3 ray_origin, float3 ray_dir, uint offset, uint4 texture_descr)
80+
{
81+
*ret_vdata = __builtin_amdgcn_image_bvh8_intersect_ray(node_ptr, ray_extent,
82+
instance_mask, ray_origin, ray_dir, offset, texture_descr,
83+
ret_ray_origin, ret_ray_dir);
84+
}
85+
#endif
86+
87+
#if __has_builtin(__builtin_amdgcn_image_bvh_dual_intersect_ray)
88+
// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh.dual.intersect.ray(
89+
// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
90+
// GFX12: <3 x float> %ray_dir, <2 x i32> %offset, <4 x i32> %texture_descr)
91+
// GFX12ISA: image_bvh_dual_intersect_ray
92+
// Exercises __builtin_amdgcn_image_bvh_dual_intersect_ray: the builtin's
// return value is stored to *ret_vdata, and ret_ray_origin / ret_ray_dir are
// passed as its last two (pointer) arguments. Here `offset` is a uint2.
void test_builtin_amdgcn_image_bvh_dual_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
93+
float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
94+
float3 ray_origin, float3 ray_dir, uint2 offset, uint4 texture_descr)
95+
{
96+
*ret_vdata = __builtin_amdgcn_image_bvh_dual_intersect_ray(node_ptr, ray_extent,
97+
instance_mask, ray_origin, ray_dir, offset, texture_descr,
98+
ret_ray_origin, ret_ray_dir);
99+
}
100+
#endif
101+
102+
#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn)
103+
// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(
104+
// GFX12: i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
105+
// GFX12ISA: ds_bvh_stack_push4_pop1_rtn
106+
// Exercises __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn with a uint4 data1
// and a literal offset of 0. The uint2 result is split: element x is written
// to *ret_vdst and element y to *ret_addr.
void test_builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(uint* ret_vdst, uint* ret_addr,
107+
uint addr, uint data0, uint4 data1)
108+
{
109+
uint2 ret = __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
110+
*ret_vdst = ret.x;
111+
*ret_addr = ret.y;
112+
}
113+
#endif
114+
115+
#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn)
116+
// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(
117+
// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
118+
// GFX12ISA: ds_bvh_stack_push8_pop1_rtn
119+
// Exercises __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn with a uint8 data1
// and a literal offset of 0. The uint2 result is split: element x is written
// to *ret_vdst and element y to *ret_addr.
void test_builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(uint* ret_vdst, uint* ret_addr,
120+
uint addr, uint data0, uint8 data1)
121+
{
122+
uint2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
123+
*ret_vdst = ret.x;
124+
*ret_addr = ret.y;
125+
}
126+
#endif
127+
128+
#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn)
129+
// GFX12: call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(
130+
// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
131+
// GFX12ISA: ds_bvh_stack_push8_pop2_rtn
132+
// Exercises __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn with a uint8 data1
// and a literal offset of 0. The result is a ulong2: element x is written to
// the ulong *ret_vdst, and element y is assigned to the uint *ret_addr.
void test_builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(ulong* ret_vdst, uint* ret_addr,
133+
uint addr, uint data0, uint8 data1)
134+
{
135+
ulong2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(addr, data0, data1, /*constant offset=*/0);
136+
*ret_vdst = ret.x;
137+
*ret_addr = ret.y;
138+
}
139+
#endif

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,7 @@ def llvm_v3i32_ty : LLVMType<v3i32>; // 3 x i32
549549
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
550550
def llvm_v6i32_ty : LLVMType<v6i32>; // 6 x i32
551551
def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
552+
def llvm_v10i32_ty : LLVMType<v10i32>; // 10 x i32
552553
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
553554
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
554555
def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
@@ -578,6 +579,7 @@ def llvm_v2f32_ty : LLVMType<v2f32>; // 2 x float
578579
def llvm_v3f32_ty : LLVMType<v3f32>; // 3 x float
579580
def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float
580581
def llvm_v8f32_ty : LLVMType<v8f32>; // 8 x float
582+
def llvm_v10f32_ty : LLVMType<v10f32>; // 10 x float
581583
def llvm_v16f32_ty : LLVMType<v16f32>; // 16 x float
582584
def llvm_v32f32_ty : LLVMType<v32f32>; // 32 x float
583585
def llvm_v1f64_ty : LLVMType<v1f64>; // 1 x double

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2720,18 +2720,21 @@ def int_amdgcn_ds_sub_gs_reg_rtn :
27202720
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
27212721
"", [SDNPMemOperand]>;
27222722

2723-
def int_amdgcn_ds_bvh_stack_rtn :
2723+
class IntDSBVHStackRtn<LLVMType vdst, LLVMType data1> :
27242724
Intrinsic<
2725-
[llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
2725+
[vdst, llvm_i32_ty], // %vdst, %addr
27262726
[
27272727
llvm_i32_ty, // %addr
27282728
llvm_i32_ty, // %data0
2729-
llvm_v4i32_ty, // %data1
2729+
data1, // %data1
27302730
llvm_i32_ty, // %offset
27312731
],
27322732
[ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
27332733
>;
27342734

2735+
def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty,
2736+
data1 = llvm_v4i32_ty>;
2737+
27352738
def int_amdgcn_s_wait_event_export_ready :
27362739
ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
27372740
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
@@ -2807,6 +2810,37 @@ def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty
28072810
// GFX12 Intrinsics
28082811
//===----------------------------------------------------------------------===//
28092812

2813+
def int_amdgcn_ds_bvh_stack_push4_pop1_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty,
2814+
data1 = llvm_v4i32_ty>;
2815+
2816+
def int_amdgcn_ds_bvh_stack_push8_pop1_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty,
2817+
data1 = llvm_v8i32_ty>;
2818+
2819+
def int_amdgcn_ds_bvh_stack_push8_pop2_rtn : IntDSBVHStackRtn<vdst = llvm_i64_ty,
2820+
data1 = llvm_v8i32_ty>;
2821+
2822+
// Results: <vdata>, <ray_origin>, <ray_dir>
2823+
// llvm.amdgcn.image.bvh.dual.intersect.ray <node_ptr>, <ray_extent>,
2824+
// <instance_mask>, <ray_origin>,
2825+
// <ray_dir>, <offsets>,
2826+
// <texture_descr>
2827+
def int_amdgcn_image_bvh_dual_intersect_ray :
2828+
Intrinsic<[llvm_v10i32_ty, llvm_v3f32_ty, llvm_v3f32_ty],
2829+
[llvm_i64_ty, llvm_float_ty, llvm_i8_ty, llvm_v3f32_ty,
2830+
llvm_v3f32_ty, llvm_v2i32_ty, llvm_v4i32_ty],
2831+
[IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2832+
2833+
// Results: <vdata>, <ray_origin>, <ray_dir>
2834+
// llvm.amdgcn.image.bvh8.intersect.ray <node_ptr>, <ray_extent>,
2835+
// <instance_mask>, <ray_origin>,
2836+
// <ray_dir>, <offset>,
2837+
// <texture_descr>
2838+
def int_amdgcn_image_bvh8_intersect_ray :
2839+
Intrinsic<[llvm_v10i32_ty, llvm_v3f32_ty, llvm_v3f32_ty],
2840+
[llvm_i64_ty, llvm_float_ty, llvm_i8_ty, llvm_v3f32_ty,
2841+
llvm_v3f32_ty, llvm_i32_ty, llvm_v4i32_ty],
2842+
[IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2843+
28102844
// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control>
28112845
def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
28122846
Intrinsic<[llvm_i32_ty],

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1088,6 +1088,12 @@ def FeaturePrngInst : SubtargetFeature<"prng-inst",
10881088
"Has v_prng_b32 instruction"
10891089
>;
10901090

1091+
def FeatureBVHDualAndBVH8Insts : SubtargetFeature<"bvh-dual-bvh-8-insts",
1092+
"HasBVHDualAndBVH8Insts",
1093+
"true",
1094+
"Has image_bvh_dual_intersect_ray and image_bvh8_intersect_ray instructions"
1095+
>;
1096+
10911097
//===------------------------------------------------------------===//
10921098
// Subtarget Features (options and debugging)
10931099
//===------------------------------------------------------------===//
@@ -1860,7 +1866,8 @@ def FeatureISAVersion12 : FeatureSet<
18601866
FeatureDPPSrc1SGPR,
18611867
FeatureMaxHardClauseLength32,
18621868
Feature1_5xVGPRs,
1863-
FeatureMemoryAtomicFAddF32DenormalSupport
1869+
FeatureMemoryAtomicFAddF32DenormalSupport,
1870+
FeatureBVHDualAndBVH8Insts
18641871
]>;
18651872

18661873
def FeatureISAVersion12_Generic: FeatureSet<
@@ -2513,6 +2520,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
25132520
def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">,
25142521
AssemblerPredicate<(all_of FeaturePrngInst)>;
25152522

2523+
def HasBVHDualAndBVH8Insts : Predicate<"Subtarget->hasBVHDualAndBVH8Insts()">,
2524+
AssemblerPredicate<(all_of FeatureBVHDualAndBVH8Insts)>;
2525+
25162526
def HasFP8ConversionScaleInsts : Predicate<"Subtarget->hasFP8ConversionScaleInsts()">,
25172527
AssemblerPredicate<(all_of FeatureFP8ConversionScaleInsts)>;
25182528

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2665,8 +2665,20 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
26652665

26662666
// We need to handle this here because tablegen doesn't support matching
26672667
// instructions with multiple outputs.
2668-
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2669-
unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2668+
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2669+
unsigned Opc;
2670+
switch (IntrID) {
2671+
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2672+
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2673+
Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2674+
break;
2675+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2676+
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2677+
break;
2678+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2679+
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2680+
break;
2681+
}
26702682
SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
26712683
N->getOperand(5), N->getOperand(0)};
26722684

@@ -2830,7 +2842,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
28302842
return;
28312843
}
28322844
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2833-
SelectDSBvhStackIntrinsic(N);
2845+
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2846+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2847+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2848+
SelectDSBvhStackIntrinsic(N, IntrID);
28342849
return;
28352850
case Intrinsic::amdgcn_init_whole_wave:
28362851
CurDAG->getMachineFunction()

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
267267
void SelectFMAD_FMA(SDNode *N);
268268
void SelectFP_EXTEND(SDNode *N);
269269
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
270-
void SelectDSBvhStackIntrinsic(SDNode *N);
270+
void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID);
271271
void SelectDS_GWS(SDNode *N, unsigned IntrID);
272272
void SelectInterpP1F16(SDNode *N);
273273
void SelectINTRINSIC_W_CHAIN(SDNode *N);

0 commit comments

Comments
 (0)