Revert "[AArch64][Machine-Combiner] Split gather patterns into neon regs to multiple vectors (#142941)" #150505

jcohen-apple · 2025-07-24T19:48:20Z

Reverting due to reported miscompiles, will reland once it is fixed.

…egs to multiple vectors (#142941)" Reverting due to reported miscompiles, will reland with a fix.

llvmbot · 2025-07-24T19:48:57Z

@llvm/pr-subscribers-backend-aarch64

Author: Jonathan Cohen (jcohen-apple)

Changes

Reverting due to reported miscompiles, will reland once it is fixed.

Patch is 67.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150505.diff

10 Files Affected:

(modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (-265)
(modified) llvm/lib/Target/AArch64/AArch64InstrInfo.h (-4)
(removed) llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir (-364)
(modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll (+65-69)
(modified) llvm/test/CodeGen/AArch64/concat-vector.ll (+2-3)
(modified) llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll (+24-26)
(modified) llvm/test/CodeGen/AArch64/fsh.ll (+56-57)
(modified) llvm/test/CodeGen/AArch64/llvm.frexp.ll (+6-8)
(modified) llvm/test/CodeGen/AArch64/neon-dotreduce.ll (+170-175)
(modified) llvm/test/CodeGen/AArch64/nontemporal.ll (+23-25)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 802e4a973cdc3..8685d7a04ac9c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,7 +20,6 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CFIInstBuilder.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -36,7 +35,6 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/StackMaps.h"
-#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -7354,9 +7352,6 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
-  case AArch64MachineCombinerPattern::GATHER_LANE_i32:
-  case AArch64MachineCombinerPattern::GATHER_LANE_i16:
-  case AArch64MachineCombinerPattern::GATHER_LANE_i8:
     return true;
   } // end switch (Pattern)
   return false;
@@ -7397,252 +7392,11 @@ static bool getMiscPatterns(MachineInstr &Root,
   return false;
 }
 
-static bool getGatherPattern(MachineInstr &Root,
-                             SmallVectorImpl<unsigned> &Patterns,
-                             unsigned LoadLaneOpCode, unsigned NumLanes) {
-  const MachineFunction *MF = Root.getMF();
-
-  // Early exit if optimizing for size.
-  if (MF->getFunction().hasMinSize())
-    return false;
-
-  const MachineRegisterInfo &MRI = MF->getRegInfo();
-  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
-
-  // The root of the pattern must load into the last lane of the vector.
-  if (Root.getOperand(2).getImm() != NumLanes - 1)
-    return false;
-
-  // Check that we have load into all lanes except lane 0.
-  // For each load we also want to check that:
-  // 1. It has a single non-debug use (since we will be replacing the virtual
-  // register)
-  // 2. That the addressing mode only uses a single offset register.
-  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
-  auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
-  SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
-  while (!RemainingLanes.empty() && CurrInstr &&
-         CurrInstr->getOpcode() == LoadLaneOpCode &&
-         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
-         CurrInstr->getNumOperands() == 4) {
-    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
-    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
-  }
-
-  if (!RemainingLanes.empty())
-    return false;
-
-  // Match the SUBREG_TO_REG sequence.
-  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
-    return false;
-
-  // Verify that the subreg to reg loads an integer into the first lane.
-  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
-  unsigned SingleLaneSizeInBits = 128 / NumLanes;
-  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
-    return false;
-
-  // Verify that it also has a single non debug use.
-  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
-    return false;
-
-  switch (NumLanes) {
-  case 4:
-    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
-    break;
-  case 8:
-    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
-    break;
-  case 16:
-    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
-    break;
-  default:
-    llvm_unreachable("Got bad number of lanes for gather pattern.");
-  }
-
-  return true;
-}
-
-/// Search for patterns where we use LD1 instructions to load into
-/// separate lanes of an 128 bit Neon register. We can increase Memory Level
-/// Parallelism by loading into 2 Neon registers instead.
-static bool getLoadPatterns(MachineInstr &Root,
-                            SmallVectorImpl<unsigned> &Patterns) {
-
-  // The pattern searches for loads into single lanes.
-  switch (Root.getOpcode()) {
-  case AArch64::LD1i32:
-    return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
-  case AArch64::LD1i16:
-    return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
-  case AArch64::LD1i8:
-    return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
-  default:
-    return false;
-  }
-}
-
-static void
-generateGatherPattern(MachineInstr &Root,
-                      SmallVectorImpl<MachineInstr *> &InsInstrs,
-                      SmallVectorImpl<MachineInstr *> &DelInstrs,
-                      DenseMap<Register, unsigned> &InstrIdxForVirtReg,
-                      unsigned Pattern, unsigned NumLanes) {
-
-  MachineFunction &MF = *Root.getParent()->getParent();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-
-  // Gather the initial load instructions to build the pattern
-  SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
-  MachineInstr *CurrInstr = &Root;
-  for (unsigned i = 0; i < NumLanes - 1; ++i) {
-    LoadToLaneInstrs.push_back(CurrInstr);
-    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
-  }
-
-  // Sort the load instructions according to the lane.
-  llvm::sort(LoadToLaneInstrs,
-             [](const MachineInstr *A, const MachineInstr *B) {
-               return A->getOperand(2).getImm() > B->getOperand(2).getImm();
-             });
-
-  MachineInstr *SubregToReg = CurrInstr;
-  LoadToLaneInstrs.push_back(
-      MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
-  auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
-
-  const TargetRegisterClass *FPR128RegClass =
-      MRI.getRegClass(Root.getOperand(0).getReg());
-
-  auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
-                                Register SrcRegister, unsigned Lane,
-                                Register OffsetRegister) {
-    auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
-    MachineInstrBuilder LoadIndexIntoRegister =
-        BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
-                NewRegister)
-            .addReg(SrcRegister)
-            .addImm(Lane)
-            .addReg(OffsetRegister, getKillRegState(true));
-    InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
-    InsInstrs.push_back(LoadIndexIntoRegister);
-    return NewRegister;
-  };
-
-  // Helper to create load instruction based on opcode
-  auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
-                                   Register OffsetReg) -> MachineInstrBuilder {
-    unsigned Opcode;
-    switch (NumLanes) {
-    case 4:
-      Opcode = AArch64::LDRSui;
-      break;
-    case 8:
-      Opcode = AArch64::LDRHui;
-      break;
-    case 16:
-      Opcode = AArch64::LDRBui;
-      break;
-    default:
-      llvm_unreachable(
-          "Got unsupported number of lanes in machine-combiner gather pattern");
-    }
-    // Immediate offset load
-    return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
-        .addReg(OffsetReg)
-        .addImm(0); // immediate offset
-  };
-
-  // Load the remaining lanes into register 0.
-  auto LanesToLoadToReg0 =
-      llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
-                       LoadToLaneInstrsAscending.begin() + NumLanes / 2);
-  auto PrevReg = SubregToReg->getOperand(0).getReg();
-  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
-    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
-                                 LoadInstr->getOperand(3).getReg());
-    DelInstrs.push_back(LoadInstr);
-  }
-  auto LastLoadReg0 = PrevReg;
-
-  // First load into register 1. Perform a LDRSui to zero out the upper lanes in
-  // a single instruction.
-  auto Lane0Load = *LoadToLaneInstrsAscending.begin();
-  auto OriginalSplitLoad =
-      *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
-  auto DestRegForMiddleIndex = MRI.createVirtualRegister(
-      MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
-
-  MachineInstrBuilder MiddleIndexLoadInstr =
-      CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
-                            OriginalSplitLoad->getOperand(3).getReg());
-
-  InstrIdxForVirtReg.insert(
-      std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
-  InsInstrs.push_back(MiddleIndexLoadInstr);
-  DelInstrs.push_back(OriginalSplitLoad);
-
-  // Subreg To Reg instruction for register 1.
-  auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
-  unsigned SubregType;
-  switch (NumLanes) {
-  case 4:
-    SubregType = AArch64::ssub;
-    break;
-  case 8:
-    SubregType = AArch64::hsub;
-    break;
-  case 16:
-    SubregType = AArch64::bsub;
-    break;
-  default:
-    llvm_unreachable(
-        "Got invalid NumLanes for machine-combiner gather pattern");
-  }
-
-  auto SubRegToRegInstr =
-      BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
-              DestRegForSubregToReg)
-          .addImm(0)
-          .addReg(DestRegForMiddleIndex, getKillRegState(true))
-          .addImm(SubregType);
-  InstrIdxForVirtReg.insert(
-      std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
-  InsInstrs.push_back(SubRegToRegInstr);
-
-  // Load remaining lanes into register 1.
-  auto LanesToLoadToReg1 =
-      llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
-                       LoadToLaneInstrsAscending.end());
-  PrevReg = SubRegToRegInstr->getOperand(0).getReg();
-  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
-    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
-                                 LoadInstr->getOperand(3).getReg());
-    if (Index == NumLanes / 2 - 2) {
-      break;
-    }
-    DelInstrs.push_back(LoadInstr);
-  }
-  auto LastLoadReg1 = PrevReg;
-
-  // Create the final zip instruction to combine the results.
-  MachineInstrBuilder ZipInstr =
-      BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
-              Root.getOperand(0).getReg())
-          .addReg(LastLoadReg0)
-          .addReg(LastLoadReg1);
-  InsInstrs.push_back(ZipInstr);
-}
-
 CombinerObjective
 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
   switch (Pattern) {
   case AArch64MachineCombinerPattern::SUBADD_OP1:
   case AArch64MachineCombinerPattern::SUBADD_OP2:
-  case AArch64MachineCombinerPattern::GATHER_LANE_i32:
-  case AArch64MachineCombinerPattern::GATHER_LANE_i16:
-  case AArch64MachineCombinerPattern::GATHER_LANE_i8:
     return CombinerObjective::MustReduceDepth;
   default:
     return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7672,10 +7426,6 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
   if (getMiscPatterns(Root, Patterns))
     return true;
 
-  // Load patterns
-  if (getLoadPatterns(Root, Patterns))
-    return true;
-
   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                      DoRegPressureReduce);
 }
@@ -8931,21 +8681,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
     break;
   }
-  case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
-    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
-                          Pattern, 4);
-    break;
-  }
-  case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
-    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
-                          Pattern, 8);
-    break;
-  }
-  case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
-    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
-                          Pattern, 16);
-    break;
-  }
 
   } // end switch (Pattern)
   // Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 02734866e7122..7c255da333e4b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,10 +172,6 @@ enum AArch64MachineCombinerPattern : unsigned {
   FMULv8i16_indexed_OP2,
 
   FNMADD,
-
-  GATHER_LANE_i32,
-  GATHER_LANE_i16,
-  GATHER_LANE_i8
 };
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir
deleted file mode 100644
index 09eb18b0e3574..0000000000000
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir
+++ /dev/null
@@ -1,364 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s
-
----
-name:            split_loads_to_fpr128
-body:             |
-  bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3, $x4
-
-    ; CHECK-LABEL: name: split_loads_to_fpr128
-    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
-    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
-    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
-    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] 
-    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0
-    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
-    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]]
-    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
-    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $q0
-    %0:gpr64common = COPY $x0
-    %1:gpr64common = COPY $x1
-    %2:gpr64common = COPY $x2
-    %3:gpr64common = COPY $x3
-    %4:gpr64common = COPY $x4
-    %5:fpr32 = LDRSroX %0, killed %1, 0, 1
-    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
-    %7:fpr128 = LD1i32 %6, 1, killed %2
-    %8:fpr128 = LD1i32 %7, 2, killed %3
-    %9:fpr128 = LD1i32 %8, 3, killed %4
-    $q0 = COPY %9
-    RET_ReallyLR implicit $q0
-
----
-name:            split_loads_to_fpr128_ui
-body:             |
-  bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3, $x4
-
-    ; CHECK-LABEL: name: split_loads_to_fpr128_ui
-    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
-    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0
-    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
-    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]] 
-    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0
-    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
-    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]]
-    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
-    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $q0
-    %0:gpr64common = COPY $x0
-    %1:gpr64common = COPY $x1
-    %2:gpr64common = COPY $x2
-    %3:gpr64common = COPY $x3
-    %4:gpr64common = COPY $x4
-    %5:fpr32 = LDRSui %0, 0
-    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
-    %7:fpr128 = LD1i32 %6, 1, killed %1
-    %8:fpr128 = LD1i32 %7, 2, killed %2
-    %9:fpr128 = LD1i32 %8, 3, killed %3
-    $q0 = COPY %9
-    RET_ReallyLR implicit $q0
-
----
-name:            split_loads_to_fpr128_i16
-body:             |
-  bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
-
-    ; CHECK-LABEL: name: split_loads_to_fpr128_i16
-    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
-    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
-    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
-    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
-    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
-    ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1
-    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
-    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]]
-    ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]]
-    ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]]
-    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0
-    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
-    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]]
-    ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY7]]
-    ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]]
-    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
-    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $q0
-    %0:gpr64common = COPY $x0
-    %1:gpr64common = COPY $x1
-    %2:gpr64common = COPY $x2
-    %3:gpr64common = COPY $x3
-    %4:gpr64common = COPY $x4
-    %5:gpr64common = COPY $x5
-    %6:gpr64common = COPY $x6
-    %7:gpr64common = COPY $x7
-    %8:gpr64common = COPY $x8
-    %9:fpr16 = LDRHroX %0, killed %1, 0, 1
-    %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub
-    %11:fpr128 = LD1i16 %10, 1, killed %2
-    %12:fpr128 = LD1i16 %11, 2, killed %3
-    %13:fpr128 = LD1i16 %12, 3, killed %4
-    %14:fpr128 = LD1i16 %13, 4, killed %5
-    %15:fpr128 = LD1i16 %14, 5, killed %6
-    %16:fpr128 = LD1i16 %15, 6, killed %7
-    %17:fpr128 = LD1i16 %16, 7, killed %8
-    $q0 = COPY %17
-    RET_ReallyLR implicit $q0
-
----
-name:            split_loads_to_fpr128_i16_ui
-body:             |
-  bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
-
-    ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui
-    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
-    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
-    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
-    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
-    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
-    ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0
-    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
-    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]]
-    ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]]
-    ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]]
-    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0
-    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
-    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]]
-    ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY6]]
-    ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]]
-    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
-    ; CH...
[truncated]

davemgreen

LGTM, thanks for doing this. (And reverts don't usually need review unless they might be contentious).

fhahn

LGTM, thanks

…egs to multiple vectors (llvm#142941)" (llvm#150505) Reverting due to reported miscompiles, will reland once it is fixed.

Revert "[AArch64][Machine-Combiner] Split gather patterns into neon r…

d73a62a

…egs to multiple vectors (#142941)" Reverting due to reported miscompiles, will reland with a fix.

jcohen-apple requested review from davemgreen and fhahn July 24, 2025 19:48

llvmbot added the backend:AArch64 label Jul 24, 2025

davemgreen approved these changes Jul 24, 2025

View reviewed changes

fhahn approved these changes Jul 25, 2025

View reviewed changes

fhahn merged commit 81bbe98 into main Jul 25, 2025
11 checks passed

fhahn deleted the users/jcohen-apple/gather-opt-revert branch July 25, 2025 13:09

kawashima-fj mentioned this pull request Aug 5, 2025

[AArch64] Incorrect load after #142941 #150004

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Revert "[AArch64][Machine-Combiner] Split gather patterns into neon regs to multiple vectors (#142941)" #150505

Revert "[AArch64][Machine-Combiner] Split gather patterns into neon regs to multiple vectors (#142941)" #150505

Uh oh!

jcohen-apple commented Jul 24, 2025

Uh oh!

llvmbot commented Jul 24, 2025

Uh oh!

davemgreen left a comment

Uh oh!

fhahn left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

Revert "[AArch64][Machine-Combiner] Split gather patterns into neon regs to multiple vectors (#142941)" #150505

Revert "[AArch64][Machine-Combiner] Split gather patterns into neon regs to multiple vectors (#142941)" #150505

Uh oh!

Conversation

jcohen-apple commented Jul 24, 2025

Uh oh!

llvmbot commented Jul 24, 2025

Uh oh!

davemgreen left a comment

Choose a reason for hiding this comment

Uh oh!

fhahn left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants