Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2461,7 +2461,7 @@ def int_amdgcn_permlanex16 :
// <sel> is a 32-bit constant whose high 8 bits must be zero which selects
// the lanes to read from.
def int_amdgcn_mov_dpp8 :
Intrinsic<[llvm_anyint_ty],
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>;
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5494,6 +5494,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
.addImm(Src5)
.getReg(0);
}
case Intrinsic::amdgcn_mov_dpp8:
return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
default:
llvm_unreachable("unhandled lane op");
}
Expand Down Expand Up @@ -7529,6 +7531,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_mov_dpp8:
return legalizeLaneOp(Helper, MI, IntrID);
case Intrinsic::amdgcn_s_buffer_prefetch_data:
return legalizeSBufferPrefetch(Helper, MI);
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6181,6 +6181,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_mov_dpp8:
Operands.push_back(Src1);
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
Expand All @@ -6207,7 +6208,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsSetInactive || IsPermLane16) {
IID == Intrinsic::amdgcn_mov_dpp8 || IsSetInactive || IsPermLane16) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
Expand Down Expand Up @@ -8833,6 +8834,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_mov_dpp8:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
Expand Down
12 changes: 7 additions & 5 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1508,12 +1508,14 @@ defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;

class MovDPP8Pattern<Predicate Pred, Instruction Inst> : GCNPat <
(i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
(vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)),
(Inst VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))> {
let OtherPredicates = [Pred];
}

def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10>;
def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11>;
def : MovDPP8Pattern<isGFX12Only, V_MOV_B32_dpp8_gfx12>;
foreach vt = Reg32Types.types in {
def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10, vt>;
def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11, vt>;
def : MovDPP8Pattern<isGFX12Only, V_MOV_B32_dpp8_gfx12, vt>;
}
160 changes: 160 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,166 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
ret void
}

; 64-bit integer operand: the checks expect one v_mov_b32_dpp per 32-bit half
; (v0 and v1), each using the dpp8 selector produced by the immediate 1,
; followed by a single 64-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_i64:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
%tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1)
store i64 %tmp0, ptr addrspace(1) %out
ret void
}

; <2 x i32> operand: the checks expect one v_mov_b32_dpp per element (v0 and
; v1) followed by a single 64-bit global store of the result. The intrinsic
; name carries the .v2i32 suffix so that it matches the overloaded operand type.
; GFX10PLUS-LABEL: {{^}}dpp8_v2i32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v2i32(<2 x i32> %in, ptr addrspace(1) %out) {
%tmp0 = call <2 x i32> @llvm.amdgcn.mov.dpp8.v2i32(<2 x i32> %in, i32 1)
store <2 x i32> %tmp0, ptr addrspace(1) %out
ret void
}

; <3 x i32> operand: the checks expect one v_mov_b32_dpp per element (v0, v1
; and v2) followed by a single 96-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_v3i32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_v3i32(<3 x i32> %in, ptr addrspace(1) %out) {
%tmp0 = call <3 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<3 x i32> %in, i32 1)
store <3 x i32> %tmp0, ptr addrspace(1) %out
ret void
}

; <4 x i32> operand: the checks expect one v_mov_b32_dpp per element (v0
; through v3) followed by a single 128-bit global store of the result. The
; intrinsic name carries the .v4i32 suffix so that it matches the overloaded
; operand type.
; GFX10PLUS-LABEL: {{^}}dpp8_v4i32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
define amdgpu_ps void @dpp8_v4i32(<4 x i32> %in, ptr addrspace(1) %out) {
%tmp0 = call <4 x i32> @llvm.amdgcn.mov.dpp8.v4i32(<4 x i32> %in, i32 1)
store <4 x i32> %tmp0, ptr addrspace(1) %out
ret void
}

; Default-address-space pointer operand: the checks expect two v_mov_b32_dpp
; instructions (v0 and v1) followed by a single 64-bit global store of the
; result.
; GFX10PLUS-LABEL: {{^}}dpp8_p0:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_p0(ptr %in, ptr addrspace(1) %out) {
%tmp0 = call ptr @llvm.amdgcn.mov.dpp8.p0(ptr %in, i32 1)
store ptr %tmp0, ptr addrspace(1) %out
ret void
}

; Scalar addrspace(3) pointer operand: the checks expect a single
; v_mov_b32_dpp on v0 followed by a 32-bit global store of the result. The
; intrinsic name carries the .p3 suffix so that it matches the scalar pointer
; operand type rather than a vector of pointers.
; GFX10PLUS-LABEL: {{^}}dpp8_p3:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off
define amdgpu_ps void @dpp8_p3(ptr addrspace(3) %in, ptr addrspace(1) %out) {
%tmp0 = call ptr addrspace(3) @llvm.amdgcn.mov.dpp8.p3(ptr addrspace(3) %in, i32 1)
store ptr addrspace(3) %tmp0, ptr addrspace(1) %out
ret void
}

; <3 x ptr addrspace(3)> operand: the checks expect one v_mov_b32_dpp per
; element (v0, v1 and v2) followed by a single 96-bit global store of the
; result.
; GFX10PLUS-LABEL: {{^}}dpp8_v3p3:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_v3p3(<3 x ptr addrspace(3)> %in, ptr addrspace(1) %out) {
%tmp0 = call <3 x ptr addrspace(3)> @llvm.amdgcn.mov.dpp8.v3p3(<3 x ptr addrspace(3)> %in, i32 1)
store <3 x ptr addrspace(3)> %tmp0, ptr addrspace(1) %out
ret void
}

; i16 operand: the checks expect a single v_mov_b32_dpp on v0 followed by a
; 16-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_i16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
define amdgpu_ps void @dpp8_i16(i16 %in, ptr addrspace(1) %out) {
%tmp0 = call i16 @llvm.amdgcn.mov.dpp8.i16(i16 %in, i32 1)
store i16 %tmp0, ptr addrspace(1) %out
ret void
}

; <4 x i16> operand: the checks expect two v_mov_b32_dpp instructions (v0 and
; v1) followed by a single 64-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_v4i16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v4i16(<4 x i16> %in, ptr addrspace(1) %out) {
%tmp0 = call <4 x i16> @llvm.amdgcn.mov.dpp8.v4i16(<4 x i16> %in, i32 1)
store <4 x i16> %tmp0, ptr addrspace(1) %out
ret void
}

; <4 x half> operand: the checks expect two v_mov_b32_dpp instructions (v0 and
; v1) followed by a single 64-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_v4f16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v4f16(<4 x half> %in, ptr addrspace(1) %out) {
%tmp0 = call <4 x half> @llvm.amdgcn.mov.dpp8.v4f16(<4 x half> %in, i32 1)
store <4 x half> %tmp0, ptr addrspace(1) %out
ret void
}

; float operand: the checks expect a single v_mov_b32_dpp on v0 followed by a
; 32-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_float:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off
define amdgpu_ps void @dpp8_float(float %in, ptr addrspace(1) %out) {
%tmp0 = call float @llvm.amdgcn.mov.dpp8.f32(float %in, i32 1)
store float %tmp0, ptr addrspace(1) %out
ret void
}

; <3 x float> operand: the checks expect one v_mov_b32_dpp per element (v0, v1
; and v2) followed by a single 96-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_v3f32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_v3f32(<3 x float> %in, ptr addrspace(1) %out) {
%tmp0 = call <3 x float> @llvm.amdgcn.mov.dpp8.v3f32(<3 x float> %in, i32 1)
store <3 x float> %tmp0, ptr addrspace(1) %out
ret void
}

; half operand: the checks expect a single v_mov_b32_dpp on v0 followed by a
; 16-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_half:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
define amdgpu_ps void @dpp8_half(half %in, ptr addrspace(1) %out) {
%tmp0 = call half @llvm.amdgcn.mov.dpp8.f16(half %in, i32 1)
store half %tmp0, ptr addrspace(1) %out
ret void
}

; bfloat operand: the checks expect a single v_mov_b32_dpp on v0 followed by a
; 16-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_bfloat:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
define amdgpu_ps void @dpp8_bfloat(bfloat %in, ptr addrspace(1) %out) {
%tmp0 = call bfloat @llvm.amdgcn.mov.dpp8.bf16(bfloat %in, i32 1)
store bfloat %tmp0, ptr addrspace(1) %out
ret void
}

; <4 x bfloat> operand: the checks expect two v_mov_b32_dpp instructions (v0
; and v1) followed by a single 64-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_v4bf16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
%tmp0 = call <4 x bfloat> @llvm.amdgcn.mov.dpp8.v4bf16(<4 x bfloat> %in, i32 1)
store <4 x bfloat> %tmp0, ptr addrspace(1) %out
ret void
}

; double operand: the checks expect one v_mov_b32_dpp per 32-bit half (v0 and
; v1) followed by a single 64-bit global store of the result.
; GFX10PLUS-LABEL: {{^}}dpp8_double:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) {
%tmp0 = call double @llvm.amdgcn.mov.dpp8.f64(double %in, i32 1)
store double %tmp0, ptr addrspace(1) %out
ret void
}

; Explicit declaration of the i32 overload of the intrinsic. It refers to
; attribute group #0, defined below as nounwind, readnone and convergent.
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0

attributes #0 = { nounwind readnone convergent }
Loading