Skip to content

Conversation

@rampitec
Copy link
Collaborator

The int_amdgcn_mov_dpp8 is overloaded, but we can only select i32.
To allow a corresponding builtin to be overloaded the same way as
int_amdgcn_mov_dpp we need it to be able to split unsupported values.

The int_amdgcn_mov_dpp8 is overloaded, but we can only select i32.
To allow a corresponding builtin to be overloaded the same way as
int_amdgcn_mov_dpp we need it to be able to split unsupported values.
Copy link
Collaborator Author

This stack of pull requests is managed by Graphite. Learn more about stacking.

Join @rampitec and the rest of your teammates on Graphite Graphite

@llvmbot
Copy link
Member

llvmbot commented Oct 30, 2024

@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

The int_amdgcn_mov_dpp8 is overloaded, but we can only select i32.
To allow a corresponding builtin to be overloaded the same way as
int_amdgcn_mov_dpp we need it to be able to split unsupported values.


Full diff: https://github.com/llvm/llvm-project/pull/114296.diff

5 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+3)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+3-1)
  • (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+7-5)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll (+102)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 92226a687cad40..143b538b361c9c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2461,7 +2461,7 @@ def int_amdgcn_permlanex16 :
 // <sel> is a 32-bit constant whose high 8 bits must be zero which selects
 // the lanes to read from.
 def int_amdgcn_mov_dpp8 :
-  Intrinsic<[llvm_anyint_ty],
+  Intrinsic<[llvm_any_ty],
             [LLVMMatchType<0>, llvm_i32_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn,
              ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b35f9faf024bdb..d7126132356d2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5494,6 +5494,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
           .addImm(Src5)
           .getReg(0);
     }
+    case Intrinsic::amdgcn_mov_dpp8:
+      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
     default:
       llvm_unreachable("unhandled lane op");
     }
@@ -7529,6 +7531,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_permlane64:
   case Intrinsic::amdgcn_set_inactive:
   case Intrinsic::amdgcn_set_inactive_chain_arg:
+  case Intrinsic::amdgcn_mov_dpp8:
     return legalizeLaneOp(Helper, MI, IntrID);
   case Intrinsic::amdgcn_s_buffer_prefetch_data:
     return legalizeSBufferPrefetch(Helper, MI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 52ca38aca5c771..0c61408d478a4e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6181,6 +6181,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     case Intrinsic::amdgcn_readlane:
     case Intrinsic::amdgcn_set_inactive:
     case Intrinsic::amdgcn_set_inactive_chain_arg:
+    case Intrinsic::amdgcn_mov_dpp8:
       Operands.push_back(Src1);
       [[fallthrough]];
     case Intrinsic::amdgcn_readfirstlane:
@@ -6207,7 +6208,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   SDValue Src0 = N->getOperand(1);
   SDValue Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
-      IsSetInactive || IsPermLane16) {
+      IID == Intrinsic::amdgcn_mov_dpp8 || IsSetInactive || IsPermLane16) {
     Src1 = N->getOperand(2);
     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
       Src2 = N->getOperand(3);
@@ -8833,6 +8834,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_permlane64:
   case Intrinsic::amdgcn_set_inactive:
   case Intrinsic::amdgcn_set_inactive_chain_arg:
+  case Intrinsic::amdgcn_mov_dpp8:
     return lowerLaneOp(*this, Op.getNode(), DAG);
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 701aeda82c91ed..6b50ed95931765 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1508,12 +1508,14 @@ defm V_CVT_F32_BF8       : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
 defm V_CVT_PK_F32_FP8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
 defm V_CVT_PK_F32_BF8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
 
-class MovDPP8Pattern<Predicate Pred, Instruction Inst> : GCNPat <
-  (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
+class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
+  (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)),
   (Inst VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))> {
   let OtherPredicates = [Pred];
 }
 
-def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10>;
-def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11>;
-def : MovDPP8Pattern<isGFX12Only, V_MOV_B32_dpp8_gfx12>;
+foreach vt = Reg32Types.types in {
+  def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10, vt>;
+  def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11, vt>;
+  def : MovDPP8Pattern<isGFX12Only, V_MOV_B32_dpp8_gfx12, vt>;
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index 8bff17b7299270..1c5f4b0a26b179 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -24,6 +24,108 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
   ret void
 }
 
+; GFX10PLUS-LABEL: {{^}}dpp8_i64:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
+define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
+  %tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1)
+  store i64 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_v2i32:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
+define amdgpu_ps void @dpp8_v2i32(<2 x i32> %in, ptr addrspace(1) %out) {
+  %tmp0 = call <2 x i32> @llvm.amdgcn.mov.dpp8.v2i32(<2 x i32> %in, i32 1)
+  store <2 x i32> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_v3i32:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
+define amdgpu_ps void @dpp8_v3i32(<3 x i32> %in, ptr addrspace(1) %out) {
+  %tmp0 = call <3 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<3 x i32> %in, i32 1)
+  store <3 x i32> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_v4i32:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
+define amdgpu_ps void @dpp8_v4i32(<4 x i32> %in, ptr addrspace(1) %out) {
+  %tmp0 = call <4 x i32> @llvm.amdgcn.mov.dpp8.v4i32(<4 x i32> %in, i32 1)
+  store <4 x i32> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_p0:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
+define amdgpu_ps void @dpp8_p0(ptr %in, ptr addrspace(1) %out) {
+  %tmp0 = call ptr @llvm.amdgcn.mov.dpp8.p0(ptr %in, i32 1)
+  store ptr %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_p3:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off
+define amdgpu_ps void @dpp8_p3(ptr addrspace(3) %in, ptr addrspace(1) %out) {
+  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.mov.dpp8.p3(ptr addrspace(3) %in, i32 1)
+  store ptr addrspace(3) %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_v3p3:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
+define amdgpu_ps void @dpp8_v3p3(<3 x ptr addrspace(3)> %in, ptr addrspace(1) %out) {
+  %tmp0 = call <3 x ptr addrspace(3)> @llvm.amdgcn.mov.dpp8.v3p3(<3 x ptr addrspace(3)> %in, i32 1)
+  store <3 x ptr addrspace(3)> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i16:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
+define amdgpu_ps void @dpp8_i16(i16 %in, ptr addrspace(1) %out) {
+  %tmp0 = call i16 @llvm.amdgcn.mov.dpp8.i16(i16 %in, i32 1)
+  store i16 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_v4i16:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
+define amdgpu_ps void @dpp8_v4i16(<4 x i16> %in, ptr addrspace(1) %out) {
+  %tmp0 = call <4 x i16> @llvm.amdgcn.mov.dpp8.v4i16(<4 x i16> %in, i32 1)
+  store <4 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_v4f16:
+; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
+define amdgpu_ps void @dpp8_v4f16(<4 x half> %in, ptr addrspace(1) %out) {
+  %tmp0 = call <4 x half> @llvm.amdgcn.mov.dpp8.v4f16(<4 x half> %in, i32 1)
+  store <4 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
 
 attributes #0 = { nounwind readnone convergent }

@rampitec rampitec merged commit 7cd2974 into main Oct 31, 2024
8 checks passed
@rampitec rampitec deleted the users/rampitec/10-30-_amdgpu_extend_mov_dpp8_intrinsic_lowering_for_generic_types branch October 31, 2024 08:15
smallp-o-p pushed a commit to smallp-o-p/llvm-project that referenced this pull request Nov 3, 2024
[AMDGPU] Extend mov_dpp8 intrinsic lowering for generic types (#114296)

The int_amdgcn_mov_dpp8 is overloaded, but we can only select i32.
To allow a corresponding builtin to be overloaded the same way as
int_amdgcn_mov_dpp we need it to be able to split unsupported values.
NoumanAmir657 pushed a commit to NoumanAmir657/llvm-project that referenced this pull request Nov 4, 2024
[AMDGPU] Extend mov_dpp8 intrinsic lowering for generic types (#114296)

The int_amdgcn_mov_dpp8 is overloaded, but we can only select i32.
To allow a corresponding builtin to be overloaded the same way as
int_amdgcn_mov_dpp we need it to be able to split unsupported values.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants