From 94ed0349e89d1aefa8088609a1a88a36bc5d6524 Mon Sep 17 00:00:00 2001
From: tomershafir
Date: Sun, 31 Aug 2025 21:35:08 +0300
Subject: [PATCH 1/3] [AArch64] Lower zero cycle FPR zeroing

Lower FPR64, FPR32 and FPR16 `fmov` zeroing into NEON zeroing when the
target supports zero cycle zeroing of NEON registers but not of the
narrower register classes. For example, on such targets `fmov d0, xzr`
is emitted as `movi v0.2d, #0000000000000000`.

It handles 2 cases: one in `AsmPrinter`, where an FP zeroing from an
immediate has been captured by pattern matching during instruction
selection, and a second post-RA in `AArch64InstrInfo::copyPhysReg`, for
uncaptured or later-generated WZR/XZR fmovs.

Adds a subtarget feature called FeatureZCZeroingFPR128 that indicates
whether the target supports zero cycle zeroing of FPR128 NEON
registers, and updates the appropriate processors.
---
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 105 +++++++-----
 llvm/lib/Target/AArch64/AArch64Features.td | 3 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 43 ++++-
 llvm/lib/Target/AArch64/AArch64Processors.td | 38 +++--
 .../CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll | 4 +-
 .../AArch64/arm64-zero-cycle-zeroing-fpr.ll | 48 ++++--
 .../AArch64/dag-combine-concat-vectors.ll | 2 +-
 llvm/test/CodeGen/AArch64/expand-select.ll | 4 +-
 llvm/test/CodeGen/AArch64/ext-narrow-index.ll | 2 +-
 llvm/test/CodeGen/AArch64/fsh.ll | 30 ++--
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 152 +++++++++---------
 .../AArch64/vec-combine-compare-to-bitmask.ll | 4 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll | 8 +-
 13 files changed, 274 insertions(+), 169 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index fa050526b722c..c31a090bba77f 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -307,6 +307,7 @@ class AArch64AsmPrinter : public AsmPrinter {
   /// Emit instruction to set float register to zero.
void emitFMov0(const MachineInstr &MI); + void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg); using MInstToMCSymbol = std::map; @@ -1829,45 +1830,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) { void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) { Register DestReg = MI.getOperand(0).getReg(); - if (STI->hasZeroCycleZeroingFPR64() && - !STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) { - // Convert H/S register to corresponding D register - if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) - DestReg = AArch64::D0 + (DestReg - AArch64::H0); - else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) - DestReg = AArch64::D0 + (DestReg - AArch64::S0); - else - assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); + if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) { + if (STI->hasZeroCycleZeroingFPR64()) { + // Convert H/S register to corresponding D register + const AArch64RegisterInfo *TRI = STI->getRegisterInfo(); + if (AArch64::FPR16RegClass.contains(DestReg)) + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR64RegClass); + else if (AArch64::FPR32RegClass.contains(DestReg)) + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR64RegClass); + else + assert(AArch64::FPR64RegClass.contains(DestReg)); + + MCInst MOVI; + MOVI.setOpcode(AArch64::MOVID); + MOVI.addOperand(MCOperand::createReg(DestReg)); + MOVI.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, MOVI); + } else if (STI->hasZeroCycleZeroingFPR128()) { + // Convert H/S/D register to corresponding Q register + const AArch64RegisterInfo *TRI = STI->getRegisterInfo(); + if (AArch64::FPR16RegClass.contains(DestReg)) { + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR128RegClass); + } else if (AArch64::FPR32RegClass.contains(DestReg)) { + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR128RegClass); + } else { + assert(AArch64::FPR64RegClass.contains(DestReg)); + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub, + &AArch64::FPR128RegClass); + } - MCInst MOVI; - MOVI.setOpcode(AArch64::MOVID); - MOVI.addOperand(MCOperand::createReg(DestReg)); - MOVI.addOperand(MCOperand::createImm(0)); - EmitToStreamer(*OutStreamer, MOVI); - } else { - MCInst FMov; - switch (MI.getOpcode()) { - default: llvm_unreachable("Unexpected opcode"); - case AArch64::FMOVH0: - FMov.setOpcode(STI->hasFullFP16() ? 
AArch64::FMOVWHr : AArch64::FMOVWSr); - if (!STI->hasFullFP16()) - DestReg = (AArch64::S0 + (DestReg - AArch64::H0)); - FMov.addOperand(MCOperand::createReg(DestReg)); - FMov.addOperand(MCOperand::createReg(AArch64::WZR)); - break; - case AArch64::FMOVS0: - FMov.setOpcode(AArch64::FMOVWSr); - FMov.addOperand(MCOperand::createReg(DestReg)); - FMov.addOperand(MCOperand::createReg(AArch64::WZR)); - break; - case AArch64::FMOVD0: - FMov.setOpcode(AArch64::FMOVXDr); - FMov.addOperand(MCOperand::createReg(DestReg)); - FMov.addOperand(MCOperand::createReg(AArch64::XZR)); - break; + MCInst MOVI; + MOVI.setOpcode(AArch64::MOVIv2d_ns); + MOVI.addOperand(MCOperand::createReg(DestReg)); + MOVI.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, MOVI); + } else { + emitFMov0AsFMov(MI, DestReg); } - EmitToStreamer(*OutStreamer, FMov); + } else { + emitFMov0AsFMov(MI, DestReg); + } +} + +void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI, + Register DestReg) { + MCInst FMov; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case AArch64::FMOVH0: + FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr); + if (!STI->hasFullFP16()) + DestReg = (AArch64::S0 + (DestReg - AArch64::H0)); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVS0: + FMov.setOpcode(AArch64::FMOVWSr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVD0: + FMov.setOpcode(AArch64::FMOVXDr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::XZR)); + break; } + EmitToStreamer(*OutStreamer, FMov); } Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc, diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 6904e09072649..46f5f0c1ca9dd 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -636,6 +636,9 @@ def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGP def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true", "Has zero-cycle zeroing instructions for GPR32 registers">; +def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true", + "Has zero-cycle zeroing instructions for FPR128 registers">; + // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0". // as movi is more efficient across all cores. Newer cores can eliminate // fmovs early and there is no difference with movi, but this not true for diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index b47ae5d2cbb17..da5e2ec02d649 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5471,8 +5471,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copies between GPR64 and FPR64. 
if (AArch64::FPR64RegClass.contains(DestReg) && AArch64::GPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); + if (AArch64::XZR == SrcReg && + !Subtarget.hasZeroCycleZeroingFPWorkaround() && + Subtarget.isNeonAvailable()) { + if (Subtarget.hasZeroCycleZeroingFPR64()) { + BuildMI(MBB, I, DL, get(AArch64::MOVID), DestReg).addImm(0); + } else if (Subtarget.hasZeroCycleZeroingFPR128()) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MCRegister DestRegQ = TRI->getMatchingSuperReg( + DestReg, AArch64::dsub, &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::MOVIv2d_ns), DestRegQ).addImm(0); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } return; } if (AArch64::GPR64RegClass.contains(DestReg) && @@ -5484,8 +5500,27 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copies between GPR32 and FPR32. if (AArch64::FPR32RegClass.contains(DestReg) && AArch64::GPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); + if (AArch64::WZR == SrcReg && + !Subtarget.hasZeroCycleZeroingFPWorkaround() && + Subtarget.isNeonAvailable()) { + if (Subtarget.hasZeroCycleZeroingFPR64()) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR64RegClass); + BuildMI(MBB, I, DL, get(AArch64::MOVID), DestRegD).addImm(0); + } else if (Subtarget.hasZeroCycleZeroingFPR128()) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MCRegister DestRegQ = TRI->getMatchingSuperReg( + DestReg, AArch64::ssub, &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::MOVIv2d_ns), DestRegQ).addImm(0); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } return; } if (AArch64::GPR32RegClass.contains(DestReg) && diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index d5f4e91ae5188..81f5d075729d9 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -344,6 +344,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128, FeatureZCZeroingFPWorkaround]>; def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", @@ -358,7 +360,9 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, - FeatureZCZeroingGPR64]>; + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", "Apple A11", [ @@ -372,7 +376,9 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, - FeatureZCZeroingGPR64]>; + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", "Apple A12", [ @@ 
-386,7 +392,9 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, - FeatureZCZeroingGPR64]>; + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", "Apple A13", [ @@ -400,7 +408,9 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, - FeatureZCZeroingGPR64]>; + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", "Apple A14", [ @@ -419,7 +429,9 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, - FeatureZCZeroingGPR64]>; + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", "Apple A15", [ @@ -438,7 +450,9 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, - FeatureZCZeroingGPR64]>; + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", "Apple A16", [ @@ -457,7 +471,9 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, - FeatureZCZeroingGPR64]>; + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", "Apple A17", [ @@ -476,7 +492,9 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, - FeatureZCZeroingGPR64]>; + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", "Apple M4", [ @@ -494,7 +512,9 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, - FeatureZCZeroingGPR64]>; + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", "Samsung Exynos-M3 processors", diff --git a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll index 7934e39b2b69f..78e20f2a5e214 100644 --- a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll +++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll @@ -69,14 +69,14 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { ; CHECK-LABEL: add_sub_su64: ; CHECK: // %bb.0: ; CHECK-NEXT: add d0, d1, d0 -; CHECK-NEXT: fmov d1, xzr +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: sub d0, d1, d0 ; CHECK-NEXT: ret ; ; GENERIC-LABEL: add_sub_su64: ; GENERIC: // %bb.0: ; GENERIC-NEXT: add d0, d1, d0 -; GENERIC-NEXT: fmov d1, xzr +; GENERIC-NEXT: movi d1, #0000000000000000 ; GENERIC-NEXT: sub d0, d1, d0 ; GENERIC-NEXT: ret %vecext = extractelement <2 x i64> %a, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll index 
2a75976d58549..ccdaa8779e38f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll @@ -1,9 +1,10 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64 -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16 ; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+zcz-fpr128 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128 ; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND -; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128 ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 @@ -12,9 +13,10 @@ define half @tf16() { entry: ; ALL-LABEL: tf16: ; FP-WORKAROUND: mov s0, wzr -; NOZCZ-FPR64: mov s0, wzr -; NOZCZ-FPR64-FULLFP16: mov h0, wzr +; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr +; NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16: mov h0, wzr ; ZCZ-FPR64: movi d0, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret half 0.0 } @@ -22,8 +24,9 @@ define float @tf32() { entry: ; ALL-LABEL: tf32: ; FP-WORKAROUND: mov s0, wzr -; NOZCZ-FPR64: mov s0, wzr +; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr ; ZCZ-FPR64: movi d0, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret float 0.0 } @@ -31,8 +34,9 @@ define double @td64() { entry: ; ALL-LABEL: td64: ; FP-WORKAROUND: mov d0, xzr -; NOZCZ-FPR64: mov d0, xzr +; NOZCZ-FPR64-NOZCZ-FPR128: mov d0, xzr ; ZCZ-FPR64: movi d0, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret double 0.0 } @@ -40,8 +44,9 @@ define <8 x i8> @tv8i8() { entry: ; ALL-LABEL: tv8i8: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <8 x i8> } @@ -49,8 +54,9 @@ define <4 x i16> @tv4i16() { entry: ; ALL-LABEL: tv4i16: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <4 x i16> } @@ -58,8 +64,9 @@ define <2 x i32> @tv2i32() { entry: ; ALL-LABEL: tv2i32: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: 
movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <2 x i32> } @@ -67,8 +74,9 @@ define <2 x float> @tv2f32() { entry: ; ALL-LABEL: tv2f32: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <2 x float> } @@ -76,8 +84,9 @@ define <16 x i8> @tv16i8() { entry: ; ALL-LABEL: tv16i8: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <16 x i8> } @@ -85,8 +94,9 @@ define <8 x i16> @tv8i16() { entry: ; ALL-LABEL: tv8i16: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <8 x i16> } @@ -94,8 +104,9 @@ define <4 x i32> @tv4i32() { entry: ; ALL-LABEL: tv4i32: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <4 x i32> } @@ -103,8 +114,9 @@ define <2 x i64> @tv2i64() { entry: ; ALL-LABEL: tv2i64: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <2 x i64> } @@ -112,8 +124,9 @@ define <4 x float> @tv4f32() { entry: ; ALL-LABEL: tv4f32: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <4 x float> } @@ -121,8 +134,9 @@ define <2 x double> @tv2d64() { entry: ; ALL-LABEL: tv2d64: ; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 -; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 ret <2 x double> } diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll index 53126a08db86f..c0c31427307b5 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll @@ -8,7 +8,7 @@ declare void @llvm.masked.scatter.nxv16i8.nxv16p0(, %In2, <2 x i128> %In3, ptr %Out) { ; CHECK-LABEL: foo: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: and w8, w0, #0x1 -; CHECK-NEXT: fmov s0, wzr ; CHECK-NEXT: ldr x11, [sp] ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: ldp x8, x10, [sp, #8] @@ -31,8 +31,8 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) { define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) { ; CHECK-LABEL: bar: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: and w8, w0, #0x1 -; CHECK-NEXT: fmov s0, wzr ; CHECK-NEXT: ldr x10, [sp, #16] ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: 
cmeq v0.4s, v1.4s, v0.4s diff --git a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll index 177f2cafcf833..f62cfef9baf28 100644 --- a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll +++ b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll @@ -382,7 +382,7 @@ entry: define <1 x i64> @i64_zero_off2(<2 x i64> %arg1) { ; CHECK-LABEL: i64_zero_off2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d0, xzr +; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index ae2ef2649102e..765f6b77b41a9 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -1379,7 +1379,7 @@ define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-LABEL: rotl_v7i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr s0, [sp, #24] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: fmov s3, w7 ; CHECK-GI-NEXT: ldr s2, [sp, #32] ; CHECK-GI-NEXT: mov x8, sp @@ -1387,31 +1387,32 @@ define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-NEXT: mov v6.16b, v0.16b ; CHECK-GI-NEXT: ldr s7, [sp] ; CHECK-GI-NEXT: ldr s5, [sp, #40] -; CHECK-GI-NEXT: mov v1.s[1], wzr ; CHECK-GI-NEXT: ld1 { v3.s }[1], [x8] ; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: fmov s16, w0 +; CHECK-GI-NEXT: mov v1.s[1], wzr ; CHECK-GI-NEXT: mov v4.s[1], v7.s[0] ; CHECK-GI-NEXT: ldr s7, [sp, #8] -; CHECK-GI-NEXT: fmov s16, w0 ; CHECK-GI-NEXT: mov v6.s[1], v2.s[0] ; CHECK-GI-NEXT: fmov s17, w0 ; CHECK-GI-NEXT: add x9, sp, #16 ; CHECK-GI-NEXT: ld1 { v3.s }[2], [x8] ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov v16.s[1], w1 +; CHECK-GI-NEXT: fmov s18, w8 ; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: fmov s2, w4 ; CHECK-GI-NEXT: mov v1.s[2], wzr -; CHECK-GI-NEXT: fmov s18, w8 -; CHECK-GI-NEXT: mov v16.s[1], w1 ; CHECK-GI-NEXT: mov v4.s[2], v7.s[0] ; CHECK-GI-NEXT: ldr s7, [sp, #16] -; CHECK-GI-NEXT: mov v17.s[1], w1 ; CHECK-GI-NEXT: mov v6.s[2], v5.s[0] ; CHECK-GI-NEXT: ld1 { v3.s }[3], [x9] -; CHECK-GI-NEXT: fmov s2, w4 +; CHECK-GI-NEXT: mov v17.s[1], w1 ; CHECK-GI-NEXT: mov v18.s[1], w8 ; CHECK-GI-NEXT: movi v19.4s, #31 -; CHECK-GI-NEXT: mov v0.s[2], v5.s[0] ; CHECK-GI-NEXT: mov v16.s[2], w2 +; CHECK-GI-NEXT: mov v2.s[1], w5 +; CHECK-GI-NEXT: mov v0.s[2], v5.s[0] ; CHECK-GI-NEXT: mov v4.s[3], v7.s[0] ; CHECK-GI-NEXT: fmov s7, w4 ; CHECK-GI-NEXT: neg v3.4s, v3.4s @@ -1419,15 +1420,14 @@ define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-NEXT: fmov s6, w8 ; CHECK-GI-NEXT: mov v17.s[2], w2 ; CHECK-GI-NEXT: mov v18.s[2], w8 -; CHECK-GI-NEXT: mov v2.s[1], w5 +; CHECK-GI-NEXT: mov v16.s[3], w3 ; CHECK-GI-NEXT: mov v7.s[1], w5 ; CHECK-GI-NEXT: and v3.16b, v3.16b, v19.16b -; CHECK-GI-NEXT: mov v16.s[3], w3 +; CHECK-GI-NEXT: mov v2.s[2], w6 ; CHECK-GI-NEXT: mov v6.s[1], w8 ; CHECK-GI-NEXT: and v4.16b, v4.16b, v19.16b ; CHECK-GI-NEXT: mov v17.s[3], w3 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v18.16b -; CHECK-GI-NEXT: mov v2.s[2], w6 ; CHECK-GI-NEXT: neg v3.4s, v3.4s ; CHECK-GI-NEXT: mov v7.s[2], w6 ; CHECK-GI-NEXT: mov v6.s[2], w8 @@ -1510,7 +1510,7 @@ define <7 x i32> @rotr_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-NEXT: fmov s2, w7 ; CHECK-GI-NEXT: mov x8, sp ; CHECK-GI-NEXT: ldr s6, [sp, #8] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: ldr s7, [sp, #32] ; 
CHECK-GI-NEXT: fmov s16, w0 ; CHECK-GI-NEXT: fmov s17, w0 @@ -1518,12 +1518,12 @@ define <7 x i32> @rotr_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-NEXT: ldr s3, [sp, #24] ; CHECK-GI-NEXT: ld1 { v2.s }[1], [x8] ; CHECK-GI-NEXT: mov w8, #31 // =0x1f -; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: add x9, sp, #8 +; CHECK-GI-NEXT: ldr s5, [sp, #40] ; CHECK-GI-NEXT: mov v4.16b, v3.16b ; CHECK-GI-NEXT: mov v3.s[1], v7.s[0] +; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: fmov s18, w8 -; CHECK-GI-NEXT: ldr s5, [sp, #40] ; CHECK-GI-NEXT: ld1 { v2.s }[2], [x9] ; CHECK-GI-NEXT: mov v17.s[1], w1 ; CHECK-GI-NEXT: mov v1.s[2], v6.s[0] @@ -1531,9 +1531,9 @@ define <7 x i32> @rotr_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-NEXT: mov v16.s[1], w1 ; CHECK-GI-NEXT: mov v4.s[1], v7.s[0] ; CHECK-GI-NEXT: ldr s7, [sp, #16] +; CHECK-GI-NEXT: fmov s19, w4 ; CHECK-GI-NEXT: mov v18.s[1], w8 ; CHECK-GI-NEXT: mov v3.s[2], v5.s[0] -; CHECK-GI-NEXT: fmov s19, w4 ; CHECK-GI-NEXT: add x10, sp, #16 ; CHECK-GI-NEXT: mov v6.s[1], w8 ; CHECK-GI-NEXT: mov v0.s[2], wzr diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 88b6f6c40baca..fb2a1fa697c26 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -2400,7 +2400,7 @@ define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: .cfi_offset w30, -88 ; CHECK-GI-NEXT: .cfi_offset w29, -96 ; CHECK-GI-NEXT: ldp q2, q1, [x1] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov b6, v2.b[3] ; CHECK-GI-NEXT: mov b7, v2.b[4] @@ -2710,7 +2710,7 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w19, -16 ; CHECK-GI-NEXT: ldp q2, q1, [x0] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: umov w15, v2.b[0] ; CHECK-GI-NEXT: umov w17, v2.b[4] ; CHECK-GI-NEXT: umov w0, v2.b[8] @@ -2830,7 +2830,7 @@ define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: .cfi_offset w30, -88 ; CHECK-GI-NEXT: .cfi_offset w29, -96 ; CHECK-GI-NEXT: ldp q2, q1, [x1] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov b5, v2.b[2] ; CHECK-GI-NEXT: mov b6, v2.b[3] @@ -3360,12 +3360,12 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 ; CHECK-GI-NEXT: sbfx w9, w11, #8, #8 ; CHECK-GI-NEXT: lsl w11, w3, #8 ; CHECK-GI-NEXT: sbfx w14, w14, #8, #8 -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: lsl w10, w10, #8 ; CHECK-GI-NEXT: mov v4.h[1], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #152] ; CHECK-GI-NEXT: sbfx w11, w11, #8, #8 -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: mov v2.h[2], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #40] ; CHECK-GI-NEXT: sbfx w10, w10, #8, #8 @@ -4012,25 +4012,24 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-GI-NEXT: sxtb w8, w4 ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: ldr w14, [sp, #448] -; CHECK-GI-NEXT: fmov s1, wzr -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: fmov s3, w8 ; CHECK-GI-NEXT: sxtb w8, w2 ; 
CHECK-GI-NEXT: fmov s5, w10 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: sxtb w9, w5 ; CHECK-GI-NEXT: ldr w10, [sp, #80] -; CHECK-GI-NEXT: mov v1.s[1], wzr -; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: mov v3.s[1], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #16] ; CHECK-GI-NEXT: sxtb w10, w10 +; CHECK-GI-NEXT: mov v1.s[1], wzr +; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: mov v2.s[2], w8 ; CHECK-GI-NEXT: sxtb w9, w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] ; CHECK-GI-NEXT: fmov s6, w10 ; CHECK-GI-NEXT: ldr w10, [sp, #64] -; CHECK-GI-NEXT: mov v1.s[2], wzr ; CHECK-GI-NEXT: mov v3.s[2], w11 ; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: sxtb w8, w8 @@ -4039,7 +4038,7 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: mov v2.s[3], w12 ; CHECK-GI-NEXT: ldr w12, [sp, #88] -; CHECK-GI-NEXT: mov v0.s[2], wzr +; CHECK-GI-NEXT: mov v1.s[2], wzr ; CHECK-GI-NEXT: mov v4.s[1], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #120] ; CHECK-GI-NEXT: sxtb w9, w9 @@ -4063,7 +4062,7 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-GI-NEXT: ldr w10, [sp, #136] ; CHECK-GI-NEXT: sxtb w13, w13 ; CHECK-GI-NEXT: sxtb w9, w9 -; CHECK-GI-NEXT: mov v1.s[3], wzr +; CHECK-GI-NEXT: mov v0.s[2], wzr ; CHECK-GI-NEXT: mov v7.s[1], w8 ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: ldr w8, [sp, #72] @@ -4072,8 +4071,9 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-GI-NEXT: mov v4.s[3], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #360] ; CHECK-GI-NEXT: sxtb w8, w8 -; CHECK-GI-NEXT: mov v0.s[3], wzr +; CHECK-GI-NEXT: mov v1.s[3], wzr ; CHECK-GI-NEXT: sxtb w13, w13 +; CHECK-GI-NEXT: mov v0.s[3], wzr ; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: mov v7.s[2], w12 ; CHECK-GI-NEXT: ldr w12, [sp, #352] @@ -4562,13 +4562,13 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: .cfi_offset w30, -88 ; CHECK-GI-NEXT: .cfi_offset w29, -96 ; CHECK-GI-NEXT: ldp q7, q16, [x1] -; CHECK-GI-NEXT: fmov s5, wzr +; CHECK-GI-NEXT: movi d5, #0000000000000000 ; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill -; CHECK-GI-NEXT: fmov s6, wzr -; CHECK-GI-NEXT: fmov s0, wzr -; CHECK-GI-NEXT: fmov s1, wzr -; CHECK-GI-NEXT: fmov s3, wzr -; CHECK-GI-NEXT: fmov s2, wzr +; CHECK-GI-NEXT: movi d6, #0000000000000000 +; CHECK-GI-NEXT: movi d0, #0000000000000000 +; CHECK-GI-NEXT: movi d1, #0000000000000000 +; CHECK-GI-NEXT: movi d3, #0000000000000000 +; CHECK-GI-NEXT: movi d2, #0000000000000000 ; CHECK-GI-NEXT: mov b23, v7.b[7] ; CHECK-GI-NEXT: mov b17, v7.b[1] ; CHECK-GI-NEXT: fmov w11, s7 @@ -4822,7 +4822,7 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: mov v19.h[6], w11 ; CHECK-GI-NEXT: fmov w12, s16 ; CHECK-GI-NEXT: fmov w11, s7 -; CHECK-GI-NEXT: fmov s4, wzr +; CHECK-GI-NEXT: movi d4, #0000000000000000 ; CHECK-GI-NEXT: uxtb w9, w9 ; CHECK-GI-NEXT: mov v20.h[6], w10 ; CHECK-GI-NEXT: umov w10, v24.h[0] @@ -4991,13 +4991,13 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) { ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: ldp q7, q19, [x0] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: ldrb w10, [x0, #32] -; CHECK-GI-NEXT: fmov s0, wzr -; CHECK-GI-NEXT: fmov s3, wzr -; CHECK-GI-NEXT: fmov s2, wzr -; CHECK-GI-NEXT: fmov s5, wzr -; CHECK-GI-NEXT: fmov s4, wzr +; CHECK-GI-NEXT: movi d0, 
#0000000000000000 +; CHECK-GI-NEXT: movi d3, #0000000000000000 +; CHECK-GI-NEXT: movi d2, #0000000000000000 +; CHECK-GI-NEXT: movi d5, #0000000000000000 +; CHECK-GI-NEXT: movi d4, #0000000000000000 ; CHECK-GI-NEXT: umov w15, v7.b[8] ; CHECK-GI-NEXT: umov w2, v7.b[12] ; CHECK-GI-NEXT: umov w16, v7.b[9] @@ -5022,13 +5022,13 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) { ; CHECK-GI-NEXT: mov v17.s[1], w16 ; CHECK-GI-NEXT: mov v18.s[1], w4 ; CHECK-GI-NEXT: umov w4, v19.b[4] +; CHECK-GI-NEXT: movi d6, #0000000000000000 ; CHECK-GI-NEXT: umov w6, v19.b[1] ; CHECK-GI-NEXT: umov w7, v19.b[5] -; CHECK-GI-NEXT: umov w19, v19.b[9] ; CHECK-GI-NEXT: mov v7.s[1], w9 ; CHECK-GI-NEXT: mov v16.s[1], w14 +; CHECK-GI-NEXT: umov w19, v19.b[9] ; CHECK-GI-NEXT: umov w20, v19.b[13] -; CHECK-GI-NEXT: fmov s6, wzr ; CHECK-GI-NEXT: umov w12, v19.b[2] ; CHECK-GI-NEXT: umov w8, v19.b[3] ; CHECK-GI-NEXT: mov v17.s[2], w3 @@ -5164,13 +5164,13 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: .cfi_offset w30, -88 ; CHECK-GI-NEXT: .cfi_offset w29, -96 ; CHECK-GI-NEXT: ldp q7, q16, [x1] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill -; CHECK-GI-NEXT: fmov s3, wzr -; CHECK-GI-NEXT: fmov s2, wzr -; CHECK-GI-NEXT: fmov s5, wzr -; CHECK-GI-NEXT: fmov s4, wzr -; CHECK-GI-NEXT: fmov s6, wzr +; CHECK-GI-NEXT: movi d3, #0000000000000000 +; CHECK-GI-NEXT: movi d2, #0000000000000000 +; CHECK-GI-NEXT: movi d5, #0000000000000000 +; CHECK-GI-NEXT: movi d4, #0000000000000000 +; CHECK-GI-NEXT: movi d6, #0000000000000000 ; CHECK-GI-NEXT: mov b19, v7.b[3] ; CHECK-GI-NEXT: mov b23, v7.b[7] ; CHECK-GI-NEXT: mov b17, v7.b[1] @@ -5454,7 +5454,7 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: smov w8, v20.h[7] ; CHECK-GI-NEXT: sxth w9, w9 ; CHECK-GI-NEXT: mov v16.s[1], w12 -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: fmov s19, w15 ; CHECK-GI-NEXT: smov w15, v22.h[6] ; CHECK-GI-NEXT: mov v1.s[1], wzr @@ -5900,28 +5900,28 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-GI-NEXT: sbfx w15, w15, #8, #8 ; CHECK-GI-NEXT: mov v23.h[2], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #112] -; CHECK-GI-NEXT: fmov s19, wzr +; CHECK-GI-NEXT: movi d19, #0000000000000000 ; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: sbfx w11, w11, #8, #8 -; CHECK-GI-NEXT: fmov s21, wzr +; CHECK-GI-NEXT: movi d21, #0000000000000000 ; CHECK-GI-NEXT: mov v22.h[3], w10 ; CHECK-GI-NEXT: ldr w10, [sp, #144] ; CHECK-GI-NEXT: lsl w8, w8, #8 -; CHECK-GI-NEXT: fmov s16, wzr -; CHECK-GI-NEXT: fmov s18, wzr -; CHECK-GI-NEXT: fmov s17, wzr +; CHECK-GI-NEXT: movi d16, #0000000000000000 +; CHECK-GI-NEXT: movi d18, #0000000000000000 +; CHECK-GI-NEXT: movi d17, #0000000000000000 ; CHECK-GI-NEXT: lsl w10, w10, #8 ; CHECK-GI-NEXT: mov v23.h[3], w9 ; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 ; CHECK-GI-NEXT: ldr w9, [sp, #120] -; CHECK-GI-NEXT: fmov s20, wzr -; CHECK-GI-NEXT: fmov s6, wzr +; CHECK-GI-NEXT: movi d20, #0000000000000000 +; CHECK-GI-NEXT: movi d6, #0000000000000000 ; CHECK-GI-NEXT: sbfx w10, w10, #8, #8 ; CHECK-GI-NEXT: mov v22.h[4], w11 ; CHECK-GI-NEXT: lsl w11, w5, #8 ; CHECK-GI-NEXT: lsl w9, w9, #8 -; CHECK-GI-NEXT: fmov s7, wzr -; CHECK-GI-NEXT: fmov s2, wzr +; CHECK-GI-NEXT: movi d7, #0000000000000000 +; CHECK-GI-NEXT: movi d2, #0000000000000000 ; CHECK-GI-NEXT: fmov s24, w10 ; CHECK-GI-NEXT: 
mov v23.h[4], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #160] @@ -5929,8 +5929,8 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-GI-NEXT: ldr w10, [sp, #168] ; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: lsl w8, w8, #8 -; CHECK-GI-NEXT: fmov s4, wzr -; CHECK-GI-NEXT: fmov s3, wzr +; CHECK-GI-NEXT: movi d4, #0000000000000000 +; CHECK-GI-NEXT: movi d3, #0000000000000000 ; CHECK-GI-NEXT: mov v24.h[1], w12 ; CHECK-GI-NEXT: lsl w12, w6, #8 ; CHECK-GI-NEXT: mov v22.h[5], w11 @@ -5941,8 +5941,8 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-GI-NEXT: ldr w11, [sp, #184] ; CHECK-GI-NEXT: ldr w9, [sp, #192] ; CHECK-GI-NEXT: sbfx w10, w10, #8, #8 -; CHECK-GI-NEXT: fmov s5, wzr -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d5, #0000000000000000 +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: mov v24.h[2], w8 ; CHECK-GI-NEXT: mov v22.h[6], w12 ; CHECK-GI-NEXT: ldr w12, [sp, #208] @@ -5951,7 +5951,7 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-GI-NEXT: lsl w9, w9, #8 ; CHECK-GI-NEXT: lsl w12, w12, #8 ; CHECK-GI-NEXT: ldr w8, [sp, #200] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: lsl w13, w13, #8 ; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: mov v19.s[1], wzr @@ -6813,10 +6813,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: fmov s23, w12 ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: sxtb w12, w7 -; CHECK-GI-NEXT: fmov s18, wzr +; CHECK-GI-NEXT: movi d18, #0000000000000000 ; CHECK-GI-NEXT: sxtb w8, w8 -; CHECK-GI-NEXT: fmov s19, wzr -; CHECK-GI-NEXT: fmov s20, wzr +; CHECK-GI-NEXT: movi d19, #0000000000000000 +; CHECK-GI-NEXT: movi d20, #0000000000000000 ; CHECK-GI-NEXT: mov v22.s[1], w9 ; CHECK-GI-NEXT: sxtb w9, w2 ; CHECK-GI-NEXT: mov v23.s[1], w13 @@ -6825,10 +6825,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: sxtb w11, w6 ; CHECK-GI-NEXT: ldr w13, [sp, #232] ; CHECK-GI-NEXT: mov v18.s[1], wzr -; CHECK-GI-NEXT: mov v19.s[1], wzr +; CHECK-GI-NEXT: movi d21, #0000000000000000 ; CHECK-GI-NEXT: fmov s25, w8 ; CHECK-GI-NEXT: ldr w8, [sp, #80] -; CHECK-GI-NEXT: fmov s21, wzr +; CHECK-GI-NEXT: mov v19.s[1], wzr ; CHECK-GI-NEXT: mov v22.s[2], w9 ; CHECK-GI-NEXT: mov v24.s[1], w10 ; CHECK-GI-NEXT: sxtb w10, w3 @@ -6837,10 +6837,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: sxtb w8, w8 ; CHECK-GI-NEXT: ldr w11, [sp, #136] ; CHECK-GI-NEXT: mov v18.s[2], wzr -; CHECK-GI-NEXT: mov v19.s[2], wzr +; CHECK-GI-NEXT: movi d6, #0000000000000000 ; CHECK-GI-NEXT: sxtb w9, w9 -; CHECK-GI-NEXT: fmov s6, wzr -; CHECK-GI-NEXT: fmov s7, wzr +; CHECK-GI-NEXT: mov v19.s[2], wzr +; CHECK-GI-NEXT: movi d7, #0000000000000000 ; CHECK-GI-NEXT: mov v22.s[3], w10 ; CHECK-GI-NEXT: ldr w10, [sp, #128] ; CHECK-GI-NEXT: mov v24.s[2], w8 @@ -6855,7 +6855,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: sxtb w8, w8 ; CHECK-GI-NEXT: fmov s26, w10 ; CHECK-GI-NEXT: ldr w10, [sp, #144] -; CHECK-GI-NEXT: mov v18.s[3], wzr +; CHECK-GI-NEXT: movi d5, #0000000000000000 ; CHECK-GI-NEXT: mov v25.s[2], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #120] ; CHECK-GI-NEXT: sxtb w12, w12 @@ -6872,14 +6872,14 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: ldr w9, [sp, #192] ; 
CHECK-GI-NEXT: add v22.4s, v22.4s, v23.4s ; CHECK-GI-NEXT: mov v27.s[1], w8 -; CHECK-GI-NEXT: mov v19.s[3], wzr -; CHECK-GI-NEXT: fmov s5, wzr +; CHECK-GI-NEXT: movi d16, #0000000000000000 +; CHECK-GI-NEXT: movi d17, #0000000000000000 ; CHECK-GI-NEXT: mov v26.s[2], w10 ; CHECK-GI-NEXT: ldr w10, [sp, #200] ; CHECK-GI-NEXT: sxtb w9, w9 -; CHECK-GI-NEXT: fmov s16, wzr -; CHECK-GI-NEXT: fmov s17, wzr -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 +; CHECK-GI-NEXT: movi d1, #0000000000000000 +; CHECK-GI-NEXT: movi d3, #0000000000000000 ; CHECK-GI-NEXT: sxtb w8, w10 ; CHECK-GI-NEXT: sxtb w10, w12 ; CHECK-GI-NEXT: fmov s28, w9 @@ -6936,7 +6936,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: fmov s9, w12 ; CHECK-GI-NEXT: sxtb w11, w11 ; CHECK-GI-NEXT: sxtb w10, w10 -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d2, #0000000000000000 ; CHECK-GI-NEXT: sxtb w9, w9 ; CHECK-GI-NEXT: mov v30.s[3], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #632] @@ -6948,10 +6948,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: ldr w10, [sp, #688] ; CHECK-GI-NEXT: sxtb w11, w11 ; CHECK-GI-NEXT: sxtb w8, w8 -; CHECK-GI-NEXT: fmov s3, wzr +; CHECK-GI-NEXT: movi d4, #0000000000000000 ; CHECK-GI-NEXT: sxtb w9, w9 ; CHECK-GI-NEXT: sxtb w10, w10 -; CHECK-GI-NEXT: fmov s2, wzr +; CHECK-GI-NEXT: mov v18.s[3], wzr ; CHECK-GI-NEXT: mov v9.s[2], w11 ; CHECK-GI-NEXT: ldr w11, [sp, #664] ; CHECK-GI-NEXT: mov v10.s[1], w8 @@ -6963,7 +6963,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: ldr w10, [sp, #672] ; CHECK-GI-NEXT: sxtb w8, w8 ; CHECK-GI-NEXT: sxtb w9, w9 -; CHECK-GI-NEXT: fmov s4, wzr +; CHECK-GI-NEXT: mov v19.s[3], wzr ; CHECK-GI-NEXT: mov v11.s[1], w11 ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: mov v20.s[1], wzr @@ -7121,15 +7121,15 @@ define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; ; CHECK-GI-LABEL: test_udot_v48i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-NEXT: ldr q7, [x0, #32] ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: ldr q17, [x1, #32] ; CHECK-GI-NEXT: ldp q4, q5, [x0] -; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: ldp q6, q16, [x1] +; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: udot v2.4s, v17.16b, v7.16b ; CHECK-GI-NEXT: udot v1.4s, v6.16b, v4.16b ; CHECK-GI-NEXT: udot v3.4s, v16.16b, v5.16b @@ -7169,7 +7169,7 @@ define i32 @test_udot_v48i8_nomla(ptr nocapture readonly %a1) { ; ; CHECK-GI-LABEL: test_udot_v48i8_nomla: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: movi v1.16b, #1 ; CHECK-GI-NEXT: ldr q7, [x0, #32] ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 @@ -7212,15 +7212,15 @@ define i32 @test_sdot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; ; CHECK-GI-LABEL: test_sdot_v48i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-NEXT: ldr q7, [x0, #32] ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: ldr q17, [x1, #32] ; CHECK-GI-NEXT: ldp q4, q5, [x0] -; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: ldp q6, q16, 
[x1] +; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: sdot v2.4s, v17.16b, v7.16b ; CHECK-GI-NEXT: sdot v1.4s, v6.16b, v4.16b ; CHECK-GI-NEXT: sdot v3.4s, v16.16b, v5.16b @@ -7639,7 +7639,7 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-GI-NEXT: fmov s2, w0 ; CHECK-GI-NEXT: ldr w11, [sp, #208] ; CHECK-GI-NEXT: ldr w8, [sp, #216] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: fmov s3, w10 ; CHECK-GI-NEXT: ldr w10, [sp, #336] ; CHECK-GI-NEXT: ldr w12, [sp, #720] @@ -7663,7 +7663,7 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-GI-NEXT: ldr w11, [sp, #16] ; CHECK-GI-NEXT: mov v7.b[1], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #480] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: mov v6.b[1], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #96] ; CHECK-GI-NEXT: mov v4.b[2], w10 @@ -8271,7 +8271,7 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-GI-NEXT: fmov s2, w0 ; CHECK-GI-NEXT: ldr w10, [sp, #216] ; CHECK-GI-NEXT: ldr w12, [sp, #848] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldr w11, [sp, #720] @@ -8295,7 +8295,7 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-GI-NEXT: mov v2.b[2], w2 ; CHECK-GI-NEXT: mov v3.b[2], w10 ; CHECK-GI-NEXT: ldr w10, [sp, #864] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: mov v7.b[1], w11 ; CHECK-GI-NEXT: ldr w11, [sp, #992] ; CHECK-GI-NEXT: mov v4.b[2], w8 diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index d6d323530946e..25702ef25510c 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -851,15 +851,15 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: fmov s2, w4 ; CHECK-GI-NEXT: mov.s v1[1], w1 ; CHECK-GI-NEXT: mov.s v2[1], w5 ; CHECK-GI-NEXT: mov.s v0[1], wzr ; CHECK-GI-NEXT: mov.s v1[2], w2 ; CHECK-GI-NEXT: cmeq.4s v0, v2, v0 -; CHECK-GI-NEXT: mvn.16b v0, v0 ; CHECK-GI-NEXT: mov.s v1[3], w3 +; CHECK-GI-NEXT: mvn.16b v0, v0 ; CHECK-GI-NEXT: cmtst.4s v1, v1, v1 ; CHECK-GI-NEXT: mov.s w8, v1[1] ; CHECK-GI-NEXT: mov.s w9, v1[2] diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index fb504028a161b..ee04e41d55046 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -2127,15 +2127,15 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-DOT-LABEL: test_udot_v48i8: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: fmov s0, wzr +; CHECK-GI-DOT-NEXT: movi d0, #0000000000000000 ; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32] ; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32] ; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0] -; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr ; CHECK-GI-DOT-NEXT: ldp q6, q16, [x1] +; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr ; 
CHECK-GI-DOT-NEXT: udot v2.4s, v17.16b, v7.16b ; CHECK-GI-DOT-NEXT: udot v1.4s, v6.16b, v4.16b ; CHECK-GI-DOT-NEXT: udot v3.4s, v16.16b, v5.16b @@ -2395,15 +2395,15 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-DOT-LABEL: test_sdot_v48i8: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: fmov s0, wzr +; CHECK-GI-DOT-NEXT: movi d0, #0000000000000000 ; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32] ; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32] ; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0] -; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr ; CHECK-GI-DOT-NEXT: ldp q6, q16, [x1] +; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr ; CHECK-GI-DOT-NEXT: sdot v2.4s, v17.16b, v7.16b ; CHECK-GI-DOT-NEXT: sdot v1.4s, v6.16b, v4.16b ; CHECK-GI-DOT-NEXT: sdot v3.4s, v16.16b, v5.16b From 8e1d54a22ab99c790e31883a3126cab8a30f7840 Mon Sep 17 00:00:00 2001 From: tomershafir Date: Tue, 9 Sep 2025 20:46:36 +0300 Subject: [PATCH 2/3] canonicalize to FMOV?0 in AArch64InstrInfo to be printed by AArch64AsmPrinter --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 35 +++----------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index da5e2ec02d649..e56fe90259d5c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5471,20 +5471,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copies between GPR64 and FPR64. if (AArch64::FPR64RegClass.contains(DestReg) && AArch64::GPR64RegClass.contains(SrcReg)) { - if (AArch64::XZR == SrcReg && - !Subtarget.hasZeroCycleZeroingFPWorkaround() && - Subtarget.isNeonAvailable()) { - if (Subtarget.hasZeroCycleZeroingFPR64()) { - BuildMI(MBB, I, DL, get(AArch64::MOVID), DestReg).addImm(0); - } else if (Subtarget.hasZeroCycleZeroingFPR128()) { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - MCRegister DestRegQ = TRI->getMatchingSuperReg( - DestReg, AArch64::dsub, &AArch64::FPR128RegClass); - BuildMI(MBB, I, DL, get(AArch64::MOVIv2d_ns), DestRegQ).addImm(0); - } else { - BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } + if (AArch64::XZR == SrcReg) { + BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg); } else { BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -5500,23 +5488,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copies between GPR32 and FPR32. 
if (AArch64::FPR32RegClass.contains(DestReg) && AArch64::GPR32RegClass.contains(SrcReg)) { - if (AArch64::WZR == SrcReg && - !Subtarget.hasZeroCycleZeroingFPWorkaround() && - Subtarget.isNeonAvailable()) { - if (Subtarget.hasZeroCycleZeroingFPR64()) { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub, - &AArch64::FPR64RegClass); - BuildMI(MBB, I, DL, get(AArch64::MOVID), DestRegD).addImm(0); - } else if (Subtarget.hasZeroCycleZeroingFPR128()) { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - MCRegister DestRegQ = TRI->getMatchingSuperReg( - DestReg, AArch64::ssub, &AArch64::FPR128RegClass); - BuildMI(MBB, I, DL, get(AArch64::MOVIv2d_ns), DestRegQ).addImm(0); - } else { - BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } + if (AArch64::WZR == SrcReg) { + BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg); } else { BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); From 29b37b4d8aa5185321784690e2baf5bb298d5900 Mon Sep 17 00:00:00 2001 From: tomershafir Date: Wed, 10 Sep 2025 03:59:52 +0300 Subject: [PATCH 3/3] fix autogenerated tests --- .../AArch64/sve-streaming-mode-fixed-length-masked-load.ll | 2 +- .../AArch64/sve-streaming-mode-fixed-length-masked-store.ll | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 42b947604b860..1fa4b5f62bdec 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -1466,8 +1466,8 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) { define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) { ; CHECK-LABEL: masked_load_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z2.s, z0.s[1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: zip1 z0.h, z0.h, z2.h diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index 9b3da75be47ec..8f4a696a28d62 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -589,8 +589,8 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; CHECK-LABEL: masked_store_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z2.s, z0.s[1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: zip1 z0.h, z0.h, z2.h