diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 15bdd9ae293a1..34a89a907e648 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5312,26 +5312,20 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, // Only change Src if src modifier could be gained. In such cases new Src // could be sgpr but this does not violate constant bus restriction for // instruction that is being selected. - // Note: Src is not changed when there is only a simple sgpr to vgpr copy - // since this could violate constant bus restriction. - Register PeekSrc = stripCopy(Src, *MRI); + Src = stripBitCast(Src, *MRI); const auto CheckAbsNeg = [&]() { // Be careful about folding modifiers if we already have an abs. fneg is // applied last, so we don't want to apply an earlier fneg. if ((Mods & SISrcMods::ABS) == 0) { unsigned ModsTmp; - std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc); + std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src); - if ((ModsTmp & SISrcMods::NEG) != 0) { + if ((ModsTmp & SISrcMods::NEG) != 0) Mods ^= SISrcMods::NEG; - Src = PeekSrc; - } - if ((ModsTmp & SISrcMods::ABS) != 0) { + if ((ModsTmp & SISrcMods::ABS) != 0) Mods |= SISrcMods::ABS; - Src = PeekSrc; - } } }; @@ -5344,8 +5338,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, Mods |= SISrcMods::OP_SEL_1; - if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) { - Src = PeekSrc; + if (isExtractHiElt(*MRI, Src, Src)) { Mods |= SISrcMods::OP_SEL_0; CheckAbsNeg(); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll index 4ebe1c499a176..4d603f7487754 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll @@ -73,10 +73,14 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x, ; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9 ; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s3, 16 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s4, 16 ; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0] -; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s11, s13, v1 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0] -; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s12, s14, v3 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: @@ -117,12 +121,18 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg ; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9 ; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10 ; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s11 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s0, 16 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s1, 16 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s3, 16 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s15, s4, 16 +; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s16, s5, 16 ; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0] -; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s12, s14, v1 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0] -; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s13, s15, v3 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0] -; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s6, s16, v5 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast <6 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index 89cd18ad9be70..1a98285230b2c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -2555,9 +2555,9 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s3 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-FLUSH-NEXT: ; return to shader part epilog @@ -2571,7 +2571,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0 ; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -2588,7 +2588,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0 ; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll new file mode 100644 index 0000000000000..b7150a224db89 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 %s -o - | FileCheck %s + +define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 { +; CHECK-LABEL: test_fmamix_constant_bus_violation_sss: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: s_lshr_b32 s5, s7, 16 +; CHECK-NEXT: s_lshr_b32 s6, s16, 16 +; CHECK-NEXT: v_mov_b32_e32 v0, s5 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: v_fma_mix_f32 v0, s4, v0, v1 op_sel_hi:[1,1,1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %lshr.0 = lshr i32 %val.0, 16 + %lshr.1 = lshr i32 %val.1, 16 + %lshr.2 = lshr i32 %val.2, 16 + %trunc.0 = trunc i32 %lshr.0 to i16 + %trunc.1 = trunc i32 %lshr.1 to i16 + %trunc.2 = trunc i32 %lshr.2 to i16 + %cast.0 = bitcast i16 %trunc.0 to half + %cast.1 = bitcast i16 %trunc.1 to half + %cast.2 = bitcast i16 %trunc.2 to half + %fpext.0 = fpext half %cast.0 to float + %fpext.1 = fpext half %cast.1 to float + %fpext.2 = fpext half %cast.2 to float + %fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2) + ret float %fma +} + +define float @test_fmamix_constant_bus_violation_ssv(i32 inreg %val.0, i32 inreg %val.1, i32 %val.2) #0 { +; CHECK-LABEL: test_fmamix_constant_bus_violation_ssv: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s5, s7, 16 +; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_fma_mix_f32 v0, s4, v1, v0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %lshr.0 = lshr i32 %val.0, 16 + %lshr.1 = lshr i32 %val.1, 16 + %lshr.2 = lshr i32 %val.2, 16 + %trunc.0 = trunc i32 %lshr.0 to i16 + %trunc.1 = trunc i32 %lshr.1 to i16 + %trunc.2 = trunc i32 %lshr.2 to i16 + %cast.0 = bitcast i16 %trunc.0 to half + %cast.1 = bitcast i16 %trunc.1 to half + %cast.2 = bitcast i16 %trunc.2 to half + %fpext.0 = fpext half %cast.0 to float + %fpext.1 = fpext half %cast.1 to float + %fpext.2 = fpext half %cast.2 to float + %fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2) + ret float %fma +} + +define float @test_fmamix_constant_bus_violation_svs(i32 inreg %val.0, i32 %val.1, i32 inreg %val.2) #0 { +; CHECK-LABEL: test_fmamix_constant_bus_violation_svs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s5, s7, 16 +; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_fma_mix_f32 v0, s4, v0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %lshr.0 = lshr i32 %val.0, 16 + %lshr.1 = lshr i32 %val.1, 16 + %lshr.2 = lshr i32 %val.2, 16 + %trunc.0 = trunc i32 %lshr.0 to i16 + %trunc.1 = trunc i32 %lshr.1 to i16 + %trunc.2 = trunc i32 %lshr.2 to i16 + %cast.0 = bitcast i16 %trunc.0 to half + %cast.1 = bitcast i16 %trunc.1 to half + %cast.2 = bitcast i16 %trunc.2 to half + %fpext.0 = fpext half %cast.0 to float + %fpext.1 = fpext half %cast.1 to float + %fpext.2 = fpext half %cast.2 to float + %fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2) + ret float %fma +} + +define float @test_fmamix_constant_bus_violation_vss(i32 %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 { +; CHECK-LABEL: test_fmamix_constant_bus_violation_vss: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s5, s7, 16 +; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_fma_mix_f32 v0, v0, s4, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %lshr.0 = lshr i32 %val.0, 16 + %lshr.1 = lshr i32 %val.1, 16 + %lshr.2 = lshr i32 %val.2, 16 + %trunc.0 = trunc i32 %lshr.0 to i16 + %trunc.1 = trunc i32 %lshr.1 to i16 + %trunc.2 = trunc i32 %lshr.2 to i16 + %cast.0 = bitcast i16 %trunc.0 to half + %cast.1 = bitcast i16 %trunc.1 to half + %cast.2 = bitcast i16 %trunc.2 to half + %fpext.0 = fpext half %cast.0 to float + %fpext.1 = fpext half %cast.1 to float + %fpext.2 = fpext half %cast.2 to float + %fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2) + ret float %fma +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir new file mode 100644 index 0000000000000..8cc1e608687fd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s + +--- +name: foo +legalized: true +regBankSelected: true +machineFunctionInfo: + mode: + fp32-output-denormals: false + fp32-input-denormals: false +body: | + bb.0: + ; GFX9-LABEL: name: foo + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX9-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX9-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]] + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]] + ; GFX9-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIX_F32 9, [[COPY3]], 8, [[COPY3]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_FMA_MIX_F32_]], implicit $exec + ; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = G_CONSTANT i32 16 + %3:sgpr(s32) = G_LSHR %0:sgpr, %2:sgpr(s32) + %4:sgpr(s16) = G_TRUNC %3:sgpr(s32) + %5:sgpr(s32) = G_LSHR %1:sgpr, %2:sgpr(s32) + %6:sgpr(s16) = G_TRUNC %5:sgpr(s32) + %7:vgpr(s16) = COPY %4:sgpr(s16) + %8:vgpr(s32) = G_FPEXT %7:vgpr(s16) + %9:vgpr(s16) = COPY %6:sgpr(s16) + %10:vgpr(s32) = G_FPEXT %9:vgpr(s16) + %11:vgpr(s32) = G_FNEG %10:vgpr + %12:vgpr(s32) = G_FMA %11:vgpr, %10:vgpr, %8:vgpr + %13:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %12:vgpr(s32) + $sgpr0 = COPY %13:sgpr(s32) + SI_RETURN_TO_EPILOG implicit $sgpr0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll new file mode 100644 index 0000000000000..fabddb3cb84a5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 %s -o - | FileCheck %s + +define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 { +; CHECK-LABEL: test_fmamix_constant_bus_violation_sss: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: s_lshr_b32 s5, s7, 16 +; CHECK-NEXT: s_lshr_b32 s6, s16, 16 +; CHECK-NEXT: v_mov_b32_e32 v0, s5 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: v_mad_mix_f32 v0, s4, v0, v1 op_sel_hi:[1,1,1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %lshr.0 = lshr i32 %val.0, 16 + %lshr.1 = lshr i32 %val.1, 16 + %lshr.2 = lshr i32 %val.2, 16 + %trunc.0 = trunc i32 %lshr.0 to i16 + %trunc.1 = trunc i32 %lshr.1 to i16 + %trunc.2 = trunc i32 %lshr.2 to i16 + %cast.0 = bitcast i16 %trunc.0 to half + %cast.1 = bitcast i16 %trunc.1 to half + %cast.2 = bitcast i16 %trunc.2 to half + %fpext.0 = fpext half %cast.0 to float + %fpext.1 = fpext half %cast.1 to float + %fpext.2 = fpext half %cast.2 to float + %fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2) + ret float %fma +} + +define float @test_fmamix_constant_bus_violation_ssv(i32 inreg %val.0, i32 inreg %val.1, i32 %val.2) #0 { +; CHECK-LABEL: test_fmamix_constant_bus_violation_ssv: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s5, s7, 16 +; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mad_mix_f32 v0, s4, v1, v0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %lshr.0 = lshr i32 %val.0, 16 + %lshr.1 = lshr i32 %val.1, 16 + %lshr.2 = lshr i32 %val.2, 16 + %trunc.0 = trunc i32 %lshr.0 to i16 + %trunc.1 = trunc i32 %lshr.1 to i16 + %trunc.2 = trunc i32 %lshr.2 to i16 + %cast.0 = bitcast i16 %trunc.0 to half + %cast.1 = bitcast i16 %trunc.1 to half + %cast.2 = bitcast i16 %trunc.2 to half + %fpext.0 = fpext half %cast.0 to float + %fpext.1 = fpext half %cast.1 to float + %fpext.2 = fpext half %cast.2 to float + %fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2) + ret float %fma +} + +define float @test_fmamix_constant_bus_violation_svs(i32 inreg %val.0, i32 %val.1, i32 inreg %val.2) #0 { +; CHECK-LABEL: test_fmamix_constant_bus_violation_svs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s5, s7, 16 +; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mad_mix_f32 v0, s4, v0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %lshr.0 = lshr i32 %val.0, 16 + %lshr.1 = lshr i32 %val.1, 16 + %lshr.2 = lshr i32 %val.2, 16 + %trunc.0 = trunc i32 %lshr.0 to i16 + %trunc.1 = trunc i32 %lshr.1 to i16 + %trunc.2 = trunc i32 %lshr.2 to i16 + %cast.0 = bitcast i16 %trunc.0 to half + %cast.1 = bitcast i16 %trunc.1 to half + %cast.2 = bitcast i16 %trunc.2 to half + %fpext.0 = fpext half %cast.0 to float + %fpext.1 = fpext half %cast.1 to float + %fpext.2 = fpext half %cast.2 to float + %fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2) + ret float %fma +} + +define float @test_fmamix_constant_bus_violation_vss(i32 %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 { +; CHECK-LABEL: test_fmamix_constant_bus_violation_vss: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s5, s7, 16 +; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mad_mix_f32 v0, v0, s4, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %lshr.0 = lshr i32 %val.0, 16 + %lshr.1 = lshr i32 %val.1, 16 + %lshr.2 = lshr i32 %val.2, 16 + %trunc.0 = trunc i32 %lshr.0 to i16 + %trunc.1 = trunc i32 %lshr.1 to i16 + %trunc.2 = trunc i32 %lshr.2 to i16 + %cast.0 = bitcast i16 %trunc.0 to half + %cast.1 = bitcast i16 %trunc.1 to half + %cast.2 = bitcast i16 %trunc.2 to half + %fpext.0 = fpext half %cast.0 to float + %fpext.1 = fpext half %cast.1 to float + %fpext.2 = fpext half %cast.2 to float + %fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2) + ret float %fma +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir new file mode 100644 index 0000000000000..4d611c15c868f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s + +--- +name: foo +legalized: true +regBankSelected: true +machineFunctionInfo: + mode: + fp32-output-denormals: false + fp32-input-denormals: false +body: | + bb.0: + ; GFX9-LABEL: name: foo + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX9-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX9-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]] + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]] + ; GFX9-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY3]], 8, [[COPY3]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MAD_MIX_F32_]], implicit $exec + ; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = G_CONSTANT i32 16 + %3:sgpr(s32) = G_LSHR %0:sgpr, %2:sgpr(s32) + %4:sgpr(s16) = G_TRUNC %3:sgpr(s32) + %5:sgpr(s32) = G_LSHR %1:sgpr, %2:sgpr(s32) + %6:sgpr(s16) = G_TRUNC %5:sgpr(s32) + %7:vgpr(s16) = COPY %4:sgpr(s16) + %8:vgpr(s32) = G_FPEXT %7:vgpr(s16) + %9:vgpr(s16) = COPY %6:sgpr(s16) + %10:vgpr(s32) = G_FPEXT %9:vgpr(s16) + %11:vgpr(s32) = G_FNEG %10:vgpr + %12:vgpr(s32) = G_FMAD %11:vgpr, %10:vgpr, %8:vgpr + %13:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %12:vgpr(s32) + $sgpr0 = COPY %13:sgpr(s32) + SI_RETURN_TO_EPILOG implicit $sgpr0 +...