diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index b22babb4a00d8..5439ea2f59111 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -215,6 +215,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, bool HasVOP3DPP = ST->hasVOP3DPP(); auto OrigOp = OrigMI.getOpcode(); + if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(OrigOp)) { + LLVM_DEBUG( + dbgs() << " failed: Did not expect any 16-bit uses of dpp values\n"); + return nullptr; + } auto DPPOp = getDPPOp(OrigOp, IsShrinkable); if (DPPOp == -1) { LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n"); diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir new file mode 100644 index 0000000000000..792acda60620e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir @@ -0,0 +1,27 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN +# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN +# XUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150 + +# FIXME-TRUE16 add gfx1200 runline when we have those true16 instructions supported + +--- + +# V_MOV_B16_t16_e64_dpp is unsupported to combine +# GCN-label: name: vop3_u16 +# GCN: %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec +# GCN: %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec +name: vop3_u16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:vgpr_16 = COPY $vgpr0 + %1:vgpr_16 = COPY $vgpr1 + %2:vgpr_16 = COPY $vgpr2 + %3:vgpr_16 = IMPLICIT_DEF + %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec + %5:vgpr_16 = V_ADD_NC_U16_t16_e64 0, %4, 0, %3, 0, 0, implicit $exec + %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec + %7:vgpr_16 = V_ADD_NC_U16_t16_e64 4, %6, 8, %5, 0, 0, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll index 5162092f78aca..926c2a3f12aab 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll @@ -1,7 +1,9 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN -; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16 ; GCN-LABEL: {{^}}dpp_add: ; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]], @@ -63,6 +65,30 @@ define amdgpu_kernel void @dpp_mul(ptr addrspace(1) %arg) { ret void } +; It is not expected to see a sequence of v_mov_b32_dpp feeding into a 16 bit instruction +; GCN-LABEL: {{^}}dpp_fadd_f16: +; GFX9GFX10: global_load_{{dword|b32}} [[V:v[0-9]+]], +; GFX9GFX10: v_add_f16_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; GFX11-TRUE16: v_mov_b32_dpp {{v[0-9]+}}, {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11-TRUE16: v_add_f16_e32 +; GFX11-FAKE16: global_load_{{dword|b32}} [[V:v[0-9]+]], +; GFX11-FAKE16: v_add_f16_e64_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +define amdgpu_kernel void @dpp_fadd_f16(ptr addrspace(1) %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id + %load = load i32, ptr addrspace(1) %gep + %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0 + %tmp01 = trunc i32 %tmp0 to i16 + %tmp1 = bitcast i16 %tmp01 to half + %tt = trunc i32 %load to i16 + %t = bitcast i16 %tt to half + %add = fadd half %tmp1, %t + %tmp2 = bitcast half %add to i16 + %tmp3 = zext i16 %tmp2 to i32 + store i32 %tmp3, ptr addrspace(1) %gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0 declare float @llvm.ceil.f32(float) diff --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir b/llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir new file mode 100644 index 0000000000000..8f63f6c8cb1c6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir @@ -0,0 +1,124 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN + +--- + +name: vopc +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GCN-LABEL: name: vopc + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: V_CMP_LT_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec + ; GCN-NEXT: V_CMP_CLASS_F32_e32_dpp 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec + ; GCN-NEXT: V_CMP_NGE_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CMP_NGE_F32_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F32_e64_dpp 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F32_e64_dpp]], 10101, implicit-def $scc + ; GCN-NEXT: V_CMP_GT_I32_e32_dpp [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = IMPLICIT_DEF + + %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMP_LT_F32_e32 %4, %0, implicit-def $vcc, implicit $mode, implicit $exec + + %10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMPX_GT_U32_nosdst_e64 %10, %0, implicit-def $exec, implicit $mode, implicit $exec + + %11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %12:sgpr_32 = V_CMP_CLASS_F32_e64 2, %11, %0, implicit $mode, implicit $exec + + %13:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %14:sgpr_32 = V_CMP_NGE_F32_e64 0, %13, 0, %0, 0, implicit $mode, implicit $exec + + %17:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %18:sgpr_32 = V_CMP_NGE_F32_e64 0, %17, 0, %0, 0, implicit $mode, implicit $exec + %19:sgpr_32 = S_AND_B32 %18, 10101, implicit-def $scc + + %20:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMP_LT_I32_e32 %0, %20, implicit-def $vcc, implicit $exec + +... +--- + +# V_MOV_B16_t16_e64_dpp is unsupported to combine +name: vopc_16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16 + + ; GCN-LABEL: name: vopc_16 + ; GCN: liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY $vgpr0_lo16 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY $vgpr1_hi16 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY $vgpr255_hi16 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: V_CMPX_EQ_I16_t16_nosdst_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]], 0, implicit-def $exec, implicit-def $vcc_lo, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp1]], 0, [[COPY]], 0, implicit-def $vcc_lo, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp2:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_GE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_t16_e64 1, [[V_MOV_B16_t16_e64_dpp2]], 0, [[COPY]], 1, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp3:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_NGE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, [[V_CMP_NGE_F16_t16_e64_]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + %0:vgpr_16 = COPY $vgpr0_lo16 + %1:vgpr_16 = COPY $vgpr1_hi16 + %2:vgpr_16 = COPY $vgpr255_hi16 + %3:vgpr_16 = IMPLICIT_DEF + + %5:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec + V_CMPX_EQ_I16_t16_nosdst_e64 0, %5, 0, %0, 0, implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec + + %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec + %7:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %6, 0, %0, 0, implicit-def $vcc, implicit $mode, implicit $exec + + %8:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec + %9:sgpr_32 = V_CMP_GE_F16_t16_e64 1, %8, 0, %0, 1, 0, implicit $mode, implicit $exec + + %15:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec + %16:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, %16, 0, %0, 0, 0, implicit $mode, implicit $exec + +... +--- + +name: mask_not_full +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GCN-LABEL: name: mask_not_full + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]].lo16, 0, [[COPY1]].hi16, 0, 1, 15, 14, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]].lo16, 0, implicit-def $vcc_lo, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[COPY1]], 1, 13, 15, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_GE_F32_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F32_e64 1, [[V_MOV_B32_dpp]], 0, [[COPY]], 1, implicit $mode, implicit $exec + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %2.lo16, 0, %1.hi16, 0, 1, 15, 14, 1, implicit $exec + %99:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %4, 0, %0.lo16, 0, implicit-def $vcc, implicit $mode, implicit $exec + + %5:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 13, 15, 1, implicit $exec + %6:sgpr_32 = V_CMP_GE_F32_e64 1, %5, 0, %0, 1, implicit $mode, implicit $exec + +...