From 936497abb984b6b2dc442eb92ffc705d5b3dbff7 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham@arm.com>
Date: Wed, 15 Oct 2025 16:59:20 +0100
Subject: [PATCH 1/3] [ARM][MVE] Invalid tail predication in LowOverheadLoop
 pass

When a loop is converted into a low-overhead loop using tail
predication via FPSCR.LTPSIZE, the MQPRCopy pseudo-instruction is
expanded into either two VMOVD or a single MVE_VORR, depending on
whether the values written to the lanes with a 'false' predicate
matter. (MVE_VORR uses the ambient LTPSIZE predicate, so it won't
write those lanes at all; the double VMOVD is slower but gets them
right.)

This check was done based on whether the output of the MQPRCopy is
live coming out of the loop. But it missed a case where the live-out
value is not _itself_ an MQPRCopy, but is a predicated operation
taking its false lanes from an MQPRCopy.

Fixes #162644, and adds a new MIR test case derived from the
reproducer in that bug.
---
 llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp   |  15 ++
 llvm/lib/Target/ARM/Thumb2InstrInfo.cpp       |  13 +
 llvm/lib/Target/ARM/Thumb2InstrInfo.h         |   3 +
 .../vctp-vs-unpredicated-copy.mir             | 231 ++++++++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 96ee69cf3f4ce..d0666e8fa4b00 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1036,6 +1036,7 @@ bool LowOverheadLoop::ValidateLiveOuts() {
   while (!Worklist.empty()) {
     MachineInstr *MI = Worklist.pop_back_val();
     if (MI->getOpcode() == ARM::MQPRCopy) {
+      LLVM_DEBUG(dbgs() << " Must generate copy as VMOV: " << *MI);
       VMOVCopies.insert(MI);
       MachineInstr *CopySrc =
           RDI.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
@@ -1045,6 +1046,20 @@ bool LowOverheadLoop::ValidateLiveOuts() {
       LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
       VMOVCopies.clear();
       return false;
+    } else if (isVectorPredicated(MI)) {
+      // If this is a predicated instruction with merging semantics,
+      // check where it gets its false lanes from, if any.
+      int InactiveIdx = findVPTInactiveOperandIdx(*MI);
+      if (InactiveIdx != -1) {
+        SmallPtrSet<MachineInstr *, 2> Defs;
+        MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef(
+            MI, MI->getOperand(InactiveIdx).getReg());
+        if (FalseSrc) {
+          LLVM_DEBUG(dbgs()
+                     << " Must check source of false lanes for: " << *MI);
+          Worklist.push_back(FalseSrc);
+        }
+      }
     }
   }
 
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 431ce38ad6e99..cb7c6a466aad4 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -805,6 +805,19 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) {
   return -1;
 }
 
+int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
+    if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) {
+      assert(MCID.getOperandConstraint(i + 3, MCOI::TIED_TO) != -1 &&
+             "Operand #3 of VPRED_R is the one tied to the output register");
+      return i + 3;
+    }
+
+  return -1;
+}
+
 ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
                                             Register &PredReg) {
   int PIdx = findFirstVPTPredOperandIdx(MI);
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 3ec3a6216b9f6..1b0bf2d499510 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -90,6 +90,9 @@ inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) {
   Register PredReg;
   return getVPTInstrPredicate(MI, PredReg);
 }
+// Identify the input operand in an MVE predicated instruction which
+// contributes the values of any inactive vector lanes.
+int findVPTInactiveOperandIdx(const MachineInstr &MI);
 
 // Recomputes the Block Mask of Instr, a VPT or VPST instruction.
 // This rebuilds the block mask of the instruction depending on the predicates
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir
new file mode 100644
index 0000000000000..15cabf244c1c8
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir
@@ -0,0 +1,231 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s
+
+# The _wrong_ output of this test is to generate the body of the
+# tail-predicated loop like this:
+#
+#     $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2
+#     renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4)
+#     $q0 = MVE_VORR $q1, $q1, 0, $noreg, $noreg, undef $q0
+#     renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0
+#     $lr = MVE_LETP killed renamable $lr, %bb.1
+#
+# in which the second MVE_VORR, copying q1 into q0, is an invalid conversion of
+# the input MQPRCopy, because it won't copy the vector lanes disabled by
+# FPSCR.LTPSIZE, and those are needed in the output value of the loop.
+#
+# In the right output, that MQPRCopy is expanded into a pair of VMOVD copying
+# d2,d3 into d0,d1 respectively, which are unaffected by LTPSIZE.
+
+--- |
+  ; ModuleID = '162644.c'
+  source_filename = "162644.c"
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8.1m.main-unknown-none-eabihf"
+
+  @inactive = dso_local local_unnamed_addr global <4 x float> zeroinitializer, align 16
+
+  ; Function Attrs: nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none)
+  define dso_local <4 x float> @test_func(ptr noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 {
+    %3 = load <4 x float>, ptr @inactive, align 16, !tbaa !3
+    %4 = add i32 %1, 3
+    %5 = call i32 @llvm.smin.i32(i32 %1, i32 4)
+    %6 = sub i32 %4, %5
+    %7 = lshr i32 %6, 2
+    %8 = add nuw nsw i32 %7, 1
+    %9 = call i32 @llvm.start.loop.iterations.i32(i32 %8)
+    br label %10
+
+  10:                                               ; preds = %10, %2
+    %11 = phi <4 x float> [ splat (float 0x3FB99999A0000000), %2 ], [ %17, %10 ]
+    %12 = phi i32 [ %1, %2 ], [ %19, %10 ]
+    %13 = phi ptr [ %0, %2 ], [ %18, %10 ]
+    %14 = phi i32 [ %9, %2 ], [ %20, %10 ]
+    %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
+    %16 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %13, i32 4, <4 x i1> %15, <4 x float> zeroinitializer)
+    %17 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %11, <4 x float> %16, <4 x i1> %15, <4 x float> %3)
+    %18 = getelementptr inbounds nuw i8, ptr %13, i32 16
+    %19 = add i32 %12, -4
+    %20 = call i32 @llvm.loop.decrement.reg.i32(i32 %14, i32 1)
+    %21 = icmp ne i32 %20, 0
+    br i1 %21, label %10, label %22, !llvm.loop !6
+
+  22:                                               ; preds = %10
+    ret <4 x float> %17
+  }
+
+  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+
+  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
+  declare <4 x float> @llvm.masked.load.v4f32.p0(ptr captures(none), i32 immarg, <4 x i1>, <4 x float>) #2
+
+  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+  declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare i32 @llvm.smin.i32(i32, i32) #3
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare i32 @llvm.start.loop.iterations.i32(i32) #4
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4
+
+  attributes #0 = { nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m52" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+pacbti,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
+  attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+  attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+  attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+  attributes #4 = { nocallback noduplicate nofree nosync nounwind willreturn }
+
+  !llvm.module.flags = !{!0, !1}
+  !llvm.ident = !{!2}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 1, !"min_enum_size", i32 4}
+  !2 = !{!"clang version 22.0.0git"}
+  !3 = !{!4, !4, i64 0}
+  !4 = !{!"omnipotent char", !5, i64 0}
+  !5 = !{!"Simple C/C++ TBAA"}
+  !6 = distinct !{!6, !7, !8}
+  !7 = !{!"llvm.loop.mustprogress"}
+  !8 = !{!"llvm.loop.unroll.disable"}
+...
+---
+name:            test_func
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: true
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: true
+  localFrameSize:  0
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  isLRSpilled:     true
+body:             |
+  ; CHECK-LABEL: name: test_func
+  ; CHECK: bb.0 (%ir-block.2):
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r1, $r7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK-NEXT:   $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive, !tbaa !3)
+  ; CHECK-NEXT:   $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0
+  ; CHECK-NEXT:   $lr = MVE_DLSTP_32 killed renamable $r1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1 (%ir-block.10, align 4):
+  ; CHECK-NEXT:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK-NEXT:   liveins: $lr, $d2, $d3, $q0, $r0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2
+  ; CHECK-NEXT:   renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4)
+  ; CHECK-NEXT:   $d0 = VMOVD $d2, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $d1 = VMOVD $d3, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0
+  ; CHECK-NEXT:   $lr = MVE_LETP killed renamable $lr, %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2 (%ir-block.22):
+  ; CHECK-NEXT:   liveins: $q0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0
+  bb.0 (%ir-block.2):
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r7, $lr
+
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg
+    tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg
+    renamable $r3 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+    renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive, !tbaa !3)
+    $r2 = tMOVr $r1, 14 /* CC::al */, $noreg
+    t2IT 10, 8, implicit-def $itstate
+    renamable $r2 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r2, implicit killed $itstate
+    renamable $r2, dead $cpsr = tSUBrr renamable $r1, killed renamable $r2, 14 /* CC::al */, $noreg
+    renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 3, 14 /* CC::al */, $noreg
+    renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
+    $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg
+    $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0
+    renamable $lr = t2DoLoopStartTP killed renamable $r2, renamable $r1
+
+  bb.1 (%ir-block.10, align 4):
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $q1, $r0, $r1
+
+    renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg, $noreg
+    $q2 = MQPRCopy killed $q0
+    MVE_VPST 8, implicit $vpr
+    renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr, renamable $lr :: (load unknown-size from %ir.13, align 4)
+    $q0 = MQPRCopy $q1
+    MVE_VPST 8, implicit $vpr
+    renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 1, killed renamable $vpr, renamable $lr, killed renamable $q0
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2 (%ir-block.22):
+    liveins: $q0
+
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0
+...

From 96985a08795828151a2a30fe7347b17f2e469875 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham@arm.com>
Date: Mon, 20 Oct 2025 11:59:33 +0100
Subject: [PATCH 2/3] Trim cruft from the new test case

---
 .../vctp-vs-unpredicated-copy.mir             | 99 ++-----------------
 1 file changed, 7 insertions(+), 92 deletions(-)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir
index 15cabf244c1c8..5783133132fd0 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir
@@ -1,8 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
 # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s
 
-# The _wrong_ output of this test is to generate the body of the
-# tail-predicated loop like this:
+# From bug #162644. The _wrong_ output of this test is to generate the
+# body of the tail-predicated loop like this:
 #
 #     $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2
 #     renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4)
@@ -18,16 +18,13 @@
 # d2,d3 into d0,d1 respectively, which are unaffected by LTPSIZE.
 
 --- |
-  ; ModuleID = '162644.c'
-  source_filename = "162644.c"
   target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
   target triple = "thumbv8.1m.main-unknown-none-eabihf"
 
   @inactive = dso_local local_unnamed_addr global <4 x float> zeroinitializer, align 16
 
-  ; Function Attrs: nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none)
-  define dso_local <4 x float> @test_func(ptr noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 {
-    %3 = load <4 x float>, ptr @inactive, align 16, !tbaa !3
+  define <4 x float> @test_func(ptr %0, i32 %1) {
+    %3 = load <4 x float>, ptr @inactive, align 16
     %4 = add i32 %1, 3
     %5 = call i32 @llvm.smin.i32(i32 %1, i32 4)
     %6 = sub i32 %4, %5
@@ -48,97 +45,21 @@
     %19 = add i32 %12, -4
     %20 = call i32 @llvm.loop.decrement.reg.i32(i32 %14, i32 1)
     %21 = icmp ne i32 %20, 0
-    br i1 %21, label %10, label %22, !llvm.loop !6
+    br i1 %21, label %10, label %22
 
   22:                                               ; preds = %10
     ret <4 x float> %17
   }
-
-  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
-
-  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
-  declare <4 x float> @llvm.masked.load.v4f32.p0(ptr captures(none), i32 immarg, <4 x i1>, <4 x float>) #2
-
-  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-  declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
-
-  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-  declare i32 @llvm.smin.i32(i32, i32) #3
-
-  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
-  declare i32 @llvm.start.loop.iterations.i32(i32) #4
-
-  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
-  declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4
-
-  attributes #0 = { nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m52" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+pacbti,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
-  attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
-  attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
-  attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-  attributes #4 = { nocallback noduplicate nofree nosync nounwind willreturn }
-
-  !llvm.module.flags = !{!0, !1}
-  !llvm.ident = !{!2}
-
-  !0 = !{i32 1, !"wchar_size", i32 4}
-  !1 = !{i32 1, !"min_enum_size", i32 4}
-  !2 = !{!"clang version 22.0.0git"}
-  !3 = !{!4, !4, i64 0}
-  !4 = !{!"omnipotent char", !5, i64 0}
-  !5 = !{!"Simple C/C++ TBAA"}
-  !6 = distinct !{!6, !7, !8}
-  !7 = !{!"llvm.loop.mustprogress"}
-  !8 = !{!"llvm.loop.unroll.disable"}
 ...
 ---
 name:            test_func
 alignment:       4
-exposesReturnsTwice: false
 legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
 tracksRegLiveness: true
-hasWinCFI:       false
-noPhis:          true
-isSSA:           false
-noVRegs:         true
-hasFakeUses:     false
-callsEHReturn:   false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes:     false
-hasEHFunclets:   false
-isOutlined:      false
-debugInstrRef:   false
-failsVerification: false
-tracksDebugUserValues: true
 registers:       []
 liveins:
   - { reg: '$r0', virtual-reg: '' }
   - { reg: '$r1', virtual-reg: '' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       8
-  offsetAdjustment: 0
-  maxAlignment:    4
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  functionContext: ''
-  maxCallFrameSize: 0
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  hasTailCall:     false
-  isCalleeSavedInfoValid: true
-  localFrameSize:  0
-fixedStack:      []
 stack:
   - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
       stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
@@ -146,12 +67,6 @@ stack:
   - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
       stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
       debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-entry_values:    []
-callSites:       []
-debugValueSubstitutions: []
-constants:       []
-machineFunctionInfo:
-  isLRSpilled:     true
 body:             |
   ; CHECK-LABEL: name: test_func
   ; CHECK: bb.0 (%ir-block.2):
@@ -164,7 +79,7 @@ body:             |
   ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $r7, -8
   ; CHECK-NEXT:   $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg
   ; CHECK-NEXT:   $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg
-  ; CHECK-NEXT:   renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive, !tbaa !3)
+  ; CHECK-NEXT:   renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive)
   ; CHECK-NEXT:   $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg
   ; CHECK-NEXT:   $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg
   ; CHECK-NEXT:   renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0
@@ -197,7 +112,7 @@ body:             |
     tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr
     $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg
     renamable $r3 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
-    renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive, !tbaa !3)
+    renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive)
     $r2 = tMOVr $r1, 14 /* CC::al */, $noreg
     t2IT 10, 8, implicit-def $itstate
     renamable $r2 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r2, implicit killed $itstate

From 87c892029c84aa209b7aa2b4fce7bae3608e610a Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham@arm.com>
Date: Tue, 21 Oct 2025 11:05:13 +0100
Subject: [PATCH 3/3] Use the new Tablegen sub-operand index constants

Both documents, and futureproofs, the offset from i to the operand we
want, so the assertion with its comment isn't needed any more.
---
 llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index cb7c6a466aad4..f5653d459eac8 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -809,11 +809,8 @@ int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) {
   const MCInstrDesc &MCID = MI.getDesc();
 
   for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
-    if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) {
-      assert(MCID.getOperandConstraint(i + 3, MCOI::TIED_TO) != -1 &&
-             "Operand #3 of VPRED_R is the one tied to the output register");
-      return i + 3;
-    }
+    if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R)
+      return i + ARM::SUBOP_vpred_r_inactive;
 
   return -1;
 }