diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 92226a687cad4..143b538b361c9 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2461,7 +2461,7 @@ def int_amdgcn_permlanex16 : // is a 32-bit constant whose high 8 bits must be zero which selects // the lanes to read from. def int_amdgcn_mov_dpp8 : - Intrinsic<[llvm_anyint_ty], + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b35f9faf024bd..d7126132356d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5494,6 +5494,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, .addImm(Src5) .getReg(0); } + case Intrinsic::amdgcn_mov_dpp8: + return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0); default: llvm_unreachable("unhandled lane op"); } @@ -7529,6 +7531,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_set_inactive: case Intrinsic::amdgcn_set_inactive_chain_arg: + case Intrinsic::amdgcn_mov_dpp8: return legalizeLaneOp(Helper, MI, IntrID); case Intrinsic::amdgcn_s_buffer_prefetch_data: return legalizeSBufferPrefetch(Helper, MI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 52ca38aca5c77..0c61408d478a4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6181,6 +6181,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, case Intrinsic::amdgcn_readlane: case Intrinsic::amdgcn_set_inactive: case Intrinsic::amdgcn_set_inactive_chain_arg: + case Intrinsic::amdgcn_mov_dpp8: Operands.push_back(Src1); [[fallthrough]]; 
case Intrinsic::amdgcn_readfirstlane: @@ -6207,7 +6208,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SDValue Src0 = N->getOperand(1); SDValue Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || - IsSetInactive || IsPermLane16) { + IID == Intrinsic::amdgcn_mov_dpp8 || IsSetInactive || IsPermLane16) { Src1 = N->getOperand(2); if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) Src2 = N->getOperand(3); @@ -8833,6 +8834,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_set_inactive: case Intrinsic::amdgcn_set_inactive_chain_arg: + case Intrinsic::amdgcn_mov_dpp8: return lowerLaneOp(*this, Op.getNode(), DAG); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 701aeda82c91e..6b50ed9593176 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1508,12 +1508,14 @@ defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>; defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>; defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>; -class MovDPP8Pattern<Predicate Pred, Instruction Inst> : GCNPat < - (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), +class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat < + (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)), (Inst VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))> { let OtherPredicates = [Pred]; } -def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10>; -def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11>; -def : MovDPP8Pattern<isGFX12Only, V_MOV_B32_dpp8_gfx12>; +foreach vt = Reg32Types.types in { + def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10, vt>; + def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11, vt>; + def : MovDPP8Pattern<isGFX12Only, V_MOV_B32_dpp8_gfx12, vt>; +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll index 8bff17b729927..049cc455ab01c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll @@ -24,6 
+24,166 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) { ret void } +; GFX10PLUS-LABEL: {{^}}dpp8_i64: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) { + %tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1) + store i64 %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v2i32: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_v2i32(<2 x i32> %in, ptr addrspace(1) %out) { + %tmp0 = call <2 x i32> @llvm.amdgcn.mov.dpp8.v2i32(<2 x i32> %in, i32 1) + store <2 x i32> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v3i32: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off +define amdgpu_ps void @dpp8_v3i32(<3 x i32> %in, ptr addrspace(1) %out) { + %tmp0 = call <3 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<3 x i32> %in, i32 1) + store <3 x i32> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v4i32: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off +define amdgpu_ps void @dpp8_v4i32(<4 x i32> %in, ptr addrspace(1) %out) { + %tmp0 = call <4 x i32> @llvm.amdgcn.mov.dpp8.v4i32(<4 x i32> %in, i32 1) + store <4 x i32> %tmp0, ptr addrspace(1) 
%out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_p0: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_p0(ptr %in, ptr addrspace(1) %out) { + %tmp0 = call ptr @llvm.amdgcn.mov.dpp8.p0(ptr %in, i32 1) + store ptr %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_p3: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off +define amdgpu_ps void @dpp8_p3(ptr addrspace(3) %in, ptr addrspace(1) %out) { + %tmp0 = call ptr addrspace(3) @llvm.amdgcn.mov.dpp8.p3(ptr addrspace(3) %in, i32 1) + store ptr addrspace(3) %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v3p3: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off +define amdgpu_ps void @dpp8_v3p3(<3 x ptr addrspace(3)> %in, ptr addrspace(1) %out) { + %tmp0 = call <3 x ptr addrspace(3)> @llvm.amdgcn.mov.dpp8.v3p3(<3 x ptr addrspace(3)> %in, i32 1) + store <3 x ptr addrspace(3)> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_i16: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off +define amdgpu_ps void @dpp8_i16(i16 %in, ptr addrspace(1) %out) { + %tmp0 = call i16 @llvm.amdgcn.mov.dpp8.i16(i16 %in, i32 1) + store i16 %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v4i16: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_v4i16(<4 x i16> %in, ptr 
addrspace(1) %out) { + %tmp0 = call <4 x i16> @llvm.amdgcn.mov.dpp8.v4i16(<4 x i16> %in, i32 1) + store <4 x i16> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v4f16: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_v4f16(<4 x half> %in, ptr addrspace(1) %out) { + %tmp0 = call <4 x half> @llvm.amdgcn.mov.dpp8.v4f16(<4 x half> %in, i32 1) + store <4 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_float: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off +define amdgpu_ps void @dpp8_float(float %in, ptr addrspace(1) %out) { + %tmp0 = call float @llvm.amdgcn.mov.dpp8.f32(float %in, i32 1) + store float %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v3f32: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off +define amdgpu_ps void @dpp8_v3f32(<3 x float> %in, ptr addrspace(1) %out) { + %tmp0 = call <3 x float> @llvm.amdgcn.mov.dpp8.v3f32(<3 x float> %in, i32 1) + store <3 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_half: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off +define amdgpu_ps void @dpp8_half(half %in, ptr addrspace(1) %out) { + %tmp0 = call half @llvm.amdgcn.mov.dpp8.f16(half %in, i32 1) + store half %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_bfloat: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off +define amdgpu_ps 
void @dpp8_bfloat(bfloat %in, ptr addrspace(1) %out) { + %tmp0 = call bfloat @llvm.amdgcn.mov.dpp8.bf16(bfloat %in, i32 1) + store bfloat %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v4bf16: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { + %tmp0 = call <4 x bfloat> @llvm.amdgcn.mov.dpp8.v4bf16(<4 x bfloat> %in, i32 1) + store <4 x bfloat> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_double: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) { + %tmp0 = call double @llvm.amdgcn.mov.dpp8.f64(double %in, i32 1) + store double %tmp0, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0 attributes #0 = { nounwind readnone convergent }