diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 86e050333acc7..a0c38c303e638 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2714,18 +2714,21 @@ def int_amdgcn_ds_sub_gs_reg_rtn : [ImmArg>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; -def int_amdgcn_ds_bvh_stack_rtn : +class IntDSBVHStackRtn : Intrinsic< - [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr + [vdst, llvm_i32_ty], // %vdst, %addr [ llvm_i32_ty, // %addr llvm_i32_ty, // %data0 - llvm_v4i32_ty, // %data1 + data1, // %data1 llvm_i32_ty, // %offset ], [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; +def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn; + def int_amdgcn_s_wait_event_export_ready : ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] @@ -2801,6 +2804,15 @@ def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; + +def int_amdgcn_ds_bvh_stack_push8_pop1_rtn : IntDSBVHStackRtn; + +def int_amdgcn_ds_bvh_stack_push8_pop2_rtn : IntDSBVHStackRtn; + // llvm.amdgcn.permlane16.var def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, Intrinsic<[llvm_i32_ty], diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 8e90754103ff1..e93a401ee20fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2665,8 +2665,23 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. 
-void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) { - unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32; +void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) { + unsigned Opc; + switch (IntrID) { + case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + Opc = AMDGPU::DS_BVH_STACK_RTN_B32; + break; + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32; + break; + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: + Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64; + break; + default: + // SelectINTRINSIC_W_CHAIN only forwards the intrinsics listed above. + llvm_unreachable("unexpected ds_bvh_stack intrinsic"); + } SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4), N->getOperand(5), N->getOperand(0)}; @@ -2830,7 +2845,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { return; } case Intrinsic::amdgcn_ds_bvh_stack_rtn: - SelectDSBvhStackIntrinsic(N); + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: + SelectDSBvhStackIntrinsic(N, IntrID); return; case Intrinsic::amdgcn_init_whole_wave: CurDAG->getMachineFunction() diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 7dcd208a9cdd4..f3b9364fdb92b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -267,7 +267,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { void SelectFMAD_FMA(SDNode *N); void SelectFP_EXTEND(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); - void SelectDSBvhStackIntrinsic(SDNode *N); + void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID); void SelectDS_GWS(SDNode *N, unsigned IntrID); void SelectInterpP1F16(SDNode *N); void SelectINTRINSIC_W_CHAIN(SDNode *N); diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 2ee82381c4ef0..a222de6a61247 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2277,7 +2277,24 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( Register Data1 = MI.getOperand(5).getReg(); unsigned Offset = MI.getOperand(6).getImm(); - auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0) + unsigned Opc; + switch (cast(MI).getIntrinsicID()) { + case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + Opc = AMDGPU::DS_BVH_STACK_RTN_B32; + break; + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32; + break; + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: + Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64; + break; + default: + // selectG_INTRINSIC_W_SIDE_EFFECTS only dispatches the intrinsics above. + llvm_unreachable("unexpected ds_bvh_stack intrinsic"); + } + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0) .addDef(Dst1) .addUse(Addr) .addUse(Data0) @@ -2332,6 +2349,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( } break; case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: return selectDSBvhStackIntrinsic(I); case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c19ee14ab1574..4860c32ee0380 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5252,7 +5252,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); break; - case 
Intrinsic::amdgcn_ds_bvh_stack_rtn: { + case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + case 
Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst OpdsMapping[1] = diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index d3487daee364f..937f5d55999cb 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -258,10 +258,13 @@ multiclass DS_1A2D_Off8_RET_mc +class DS_BVH_STACK : DS_Pseudo.ret:$vdst, VGPR_32:$addr), - (ins VGPR_32:$addr_in, getLdStRegisterOperand.ret:$data0, VReg_128:$data1, Offset:$offset), + (outs getLdStRegisterOperand.ret:$vdst, VGPR_32:$addr), + (ins VGPR_32:$addr_in, getLdStRegisterOperand.ret:$data0, + data1_rc:$data1, Offset:$offset), " $vdst, $addr, $data0, $data1$offset"> { let Constraints = "$addr = $addr_in"; let DisableEncoding = "$addr_in"; @@ -722,7 +725,8 @@ def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>; let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasImageInsts] in -def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">; +def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32", + VGPR_32, VReg_128> ; } // let SubtargetPredicate = isGFX11Plus @@ -732,6 +736,13 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">; let SubtargetPredicate = isGFX12Plus in { +let OtherPredicates = [HasImageInsts] in { +def DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_BVH_STACK< + "ds_bvh_stack_push8_pop1_rtn_b32", VGPR_32, VReg_256>; +def DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_BVH_STACK< + "ds_bvh_stack_push8_pop2_rtn_b64", VReg_64, VReg_256>; +} // End OtherPredicates = [HasImageInsts]. 
+ defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">; defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>; defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">; @@ -1268,6 +1279,11 @@ defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>; defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>; defm DS_BPERMUTE_FI_B32 : DS_Real_gfx12<0x0cd>; +defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, + "ds_bvh_stack_push4_pop1_rtn_b32", true>; +defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>; +defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>; + // New aliases added in GFX12 without renaming the instructions. let AssemblerPredicate = isGFX12Plus in { def : AMDGPUMnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9743320601ed4..250963b3019a0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1457,7 +1457,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_ds_bvh_stack_rtn: { + case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: { Info.opc = ISD::INTRINSIC_W_CHAIN; const GCNTargetMachine &TM = diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll new file mode 100644 index 0000000000000..44f5c46954d3b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | 
FileCheck %s + +define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data0, <4 x i32> %data1) { +; CHECK-LABEL: test_ds_bvh_stack_push4_pop1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push4_pop1_1(i32 %addr, i32 %data0, <4 x i32> %data1) { +; CHECK-LABEL: test_ds_bvh_stack_push4_pop1_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data0, <8 x i32> %data1) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 
poison, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop1_1(i32 %addr, i32 %data0, <8 x i32> %data1) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop1_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop2: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0) + %vdst = extractvalue { i64, i32 } %pair, 0 + %newaddr = extractvalue { i64, i32 } %pair, 1 + %vdst.v2i32 = bitcast i64 %vdst to <2 x i32> + %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0 + %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop2_1(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop2_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9] offset:1 +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, 
off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1) + %vdst = extractvalue { i64, i32 } %pair, 0 + %newaddr = extractvalue { i64, i32 } %pair, 1 + %vdst.v2i32 = bitcast i64 %vdst to <2 x i32> + %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0 + %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false) + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s index 34c42affdd46c..364463f9404bc 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s @@ -1922,3 +1922,21 @@ ds_bpermute_fi_b32 v5, v1, v2 offset:0 ds_bpermute_fi_b32 v255, v255, v255 offset:4 // GFX12: encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff] + +ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] +// GFX12: encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] + +ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 +// GFX12: encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] + +ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] +// GFX12: encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01] + +ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 +// GFX12: encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01] + +ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] +// GFX12: encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe] + +ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127 +// GFX12: encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s index c10b96a292178..55e284d4afde9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s @@ -35,3 +35,6 @@ ds_subrev_u64 v1, v[2:3] ds_subrev_rtn_u64 v[5:6], v1, v[2:3] 
// GFX12: ds_rsub_rtn_u64 v[5:6], v1, v[2:3] ; encoding: [0x00,0x00,0x88,0xd9,0x01,0x02,0x00,0x05] + +ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5] +// GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt index d66748135ffd4..d9381b50ca29f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt @@ -3242,3 +3242,27 @@ # GFX12: ds_bpermute_fi_b32 v255, v255, v255 offset:4 ; encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff] 0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff + +# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] +0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01 + +# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 ; encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] +0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01 + +# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] ; encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01] +0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01 + +# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 ; encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01] +0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01 + +# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] ; encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe] +0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe + +# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe] +0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe + +# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01] +0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01 + +# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] ; encoding: 
[0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01] +0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01