llvm · davemgreen · Dec 2, 2025 · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025
diff --git a/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h b/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
@@ -0,0 +1,57 @@
+//===- ARMCommonInstCombineIntrinsic.h -
+// instCombineIntrinsic opts for both ARM and AArch64 -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
+#define LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
+
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+namespace llvm {
+
+namespace ARMCommon {
+
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC);
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool IsSigned);
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC);
+
+} // namespace ARMCommon
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
@@ -2856,6 +2857,18 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_neon_fmaxnm:
   case Intrinsic::aarch64_neon_fminnm:
     return instCombineMaxMinNM(IC, II);
+  case Intrinsic::aarch64_neon_tbl1:
+    return ARMCommon::simplifyNeonTbl1(II, IC);
+  case Intrinsic::aarch64_neon_smull:
+  case Intrinsic::aarch64_neon_umull: {
+    bool IsSigned = IID == Intrinsic::aarch64_neon_smull;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
+  }
+  case Intrinsic::aarch64_crypto_aesd:
+  case Intrinsic::aarch64_crypto_aese:
+  case Intrinsic::aarch64_sve_aesd:
+  case Intrinsic::aarch64_sve_aese:
+    return ARMCommon::simplifyAES(II, IC);
   case Intrinsic::aarch64_sve_convert_from_svbool:
     return instCombineConvertFromSVBool(IC, II);
   case Intrinsic::aarch64_sve_dup:

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
@@ -182,6 +183,19 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     break;
   }
 
+  case Intrinsic::arm_neon_vtbl1:
+    return ARMCommon::simplifyNeonTbl1(II, IC);
+
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    bool IsSigned = IID == Intrinsic::arm_neon_vmulls;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
+  }
+
+  case Intrinsic::arm_neon_aesd:
+  case Intrinsic::arm_neon_aese:
+    return ARMCommon::simplifyAES(II, IC);
+
   case Intrinsic::arm_mve_pred_i2v: {
     Value *Arg = II.getArgOperand(0);
     Value *ArgArg;

@@ -737,44 +737,6 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
   return nullptr;
 }
 
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-static Value *simplifyNeonTbl1(const IntrinsicInst &II,
-                               InstCombiner::BuilderTy &Builder) {
-  // Bail out if the mask is not a constant.
-  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
-  if (!C)
-    return nullptr;
-
-  auto *VecTy = cast<FixedVectorType>(II.getType());
-  unsigned NumElts = VecTy->getNumElements();
-
-  // Only perform this transformation for <8 x i8> vector types.
-  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
-    return nullptr;
-
-  int Indexes[8];
-
-  for (unsigned I = 0; I < NumElts; ++I) {
-    Constant *COp = C->getAggregateElement(I);
-
-    if (!COp || !isa<ConstantInt>(COp))
-      return nullptr;
-
-    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
-
-    // Make sure the mask indices are in range.
-    if ((unsigned)Indexes[I] >= NumElts)
-      return nullptr;
-  }
-
-  auto *V1 = II.getArgOperand(0);
-  auto *V2 = Constant::getNullValue(V1->getType());
-  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
-}
-
 // Returns true iff the 2 intrinsics have the same operands, limiting the
 // comparison to the first NumOperands.
 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -3155,72 +3117,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         Intrinsic::getOrInsertDeclaration(II->getModule(), NewIntrin);
     return CallInst::Create(NewFn, CallArgs);
   }
-  case Intrinsic::arm_neon_vtbl1:
-  case Intrinsic::aarch64_neon_tbl1:
-    if (Value *V = simplifyNeonTbl1(*II, Builder))
-      return replaceInstUsesWith(*II, V);
-    break;
-
-  case Intrinsic::arm_neon_vmulls:
-  case Intrinsic::arm_neon_vmullu:
-  case Intrinsic::aarch64_neon_smull:
-  case Intrinsic::aarch64_neon_umull: {
-    Value *Arg0 = II->getArgOperand(0);
-    Value *Arg1 = II->getArgOperand(1);
-
-    // Handle mul by zero first:
-    if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
-      return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
-    }
-
-    // Check for constant LHS & RHS - in this case we just simplify.
-    bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
-                 IID == Intrinsic::aarch64_neon_umull);
-    VectorType *NewVT = cast<VectorType>(II->getType());
-    if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
-      if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
-        Value *V0 = Builder.CreateIntCast(CV0, NewVT, /*isSigned=*/!Zext);
-        Value *V1 = Builder.CreateIntCast(CV1, NewVT, /*isSigned=*/!Zext);
-        return replaceInstUsesWith(CI, Builder.CreateMul(V0, V1));
-      }
-
-      // Couldn't simplify - canonicalize constant to the RHS.
-      std::swap(Arg0, Arg1);
-    }
-
-    // Handle mul by one:
-    if (Constant *CV1 = dyn_cast<Constant>(Arg1))
-      if (ConstantInt *Splat =
-              dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
-        if (Splat->isOne())
-          return CastInst::CreateIntegerCast(Arg0, II->getType(),
-                                             /*isSigned=*/!Zext);
-
-    break;
-  }
-  case Intrinsic::arm_neon_aesd:
-  case Intrinsic::arm_neon_aese:
-  case Intrinsic::aarch64_crypto_aesd:
-  case Intrinsic::aarch64_crypto_aese:
-  case Intrinsic::aarch64_sve_aesd:
-  case Intrinsic::aarch64_sve_aese: {
-    Value *DataArg = II->getArgOperand(0);
-    Value *KeyArg  = II->getArgOperand(1);
-
-    // Accept zero on either operand.
-    if (!match(KeyArg, m_ZeroInt()))
-      std::swap(KeyArg, DataArg);
-
-    // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
-    Value *Data, *Key;
-    if (match(KeyArg, m_ZeroInt()) &&
-        match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
-      replaceOperand(*II, 0, Data);
-      replaceOperand(*II, 1, Key);
-      return II;
-    }
-    break;
-  }
   case Intrinsic::hexagon_V6_vandvrt:
   case Intrinsic::hexagon_V6_vandvrt_128B: {
     // Simplify Q -> V -> Q conversion.

diff --git a/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp b/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
@@ -0,0 +1,136 @@
+//===- ARMCommonInstCombineIntrinsic.cpp -
+//                  instCombineIntrinsic opts for both ARM and AArch64  ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace llvm {
+namespace ARMCommon {
+
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC) {
+  // Bail out if the mask is not a constant.
+  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+  if (!C)
+    return nullptr;
+
+  auto *VecTy = cast<FixedVectorType>(II.getType());
+  unsigned NumElts = VecTy->getNumElements();
+
+  // Only perform this transformation for <8 x i8> vector types.
+  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+    return nullptr;
+
+  int Indexes[8];
+
+  for (unsigned I = 0; I < NumElts; ++I) {
+    Constant *COp = C->getAggregateElement(I);
+
+    if (!COp || !isa<ConstantInt>(COp))
+      return nullptr;
+
+    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+    // Make sure the mask indices are in range.
+    if ((unsigned)Indexes[I] >= NumElts)
+      return nullptr;
+  }
+
+  auto *V1 = II.getArgOperand(0);
+  auto *V2 = Constant::getNullValue(V1->getType());
+  Value *Shuf = IC.Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
+  return IC.replaceInstUsesWith(II, Shuf);
+}
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool IsSigned) {
+  Value *Arg0 = II.getArgOperand(0);
+  Value *Arg1 = II.getArgOperand(1);
+
+  // Handle mul by zero first:
+  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+    return IC.replaceInstUsesWith(II, ConstantAggregateZero::get(II.getType()));
+  }
+
+  // Check for constant LHS & RHS - in this case we just simplify.
+  VectorType *NewVT = cast<VectorType>(II.getType());
+  if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
+    if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
+      Value *V0 = IC.Builder.CreateIntCast(CV0, NewVT, IsSigned);
+      Value *V1 = IC.Builder.CreateIntCast(CV1, NewVT, IsSigned);
+      return IC.replaceInstUsesWith(II, IC.Builder.CreateMul(V0, V1));
+    }
+
+    // Couldn't simplify - canonicalize constant to the RHS.
+    std::swap(Arg0, Arg1);
+  }
+
+  // Handle mul by one:
+  if (Constant *CV1 = dyn_cast<Constant>(Arg1))
+    if (ConstantInt *Splat =
+            dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
+      if (Splat->isOne())
+        return CastInst::CreateIntegerCast(Arg0, II.getType(), IsSigned);
+
+  return nullptr;
+}
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC) {
+  Value *DataArg = II.getArgOperand(0);
+  Value *KeyArg = II.getArgOperand(1);
+
+  // Accept zero on either operand.
+  if (!match(KeyArg, m_ZeroInt()))
+    std::swap(KeyArg, DataArg);
+
+  // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
+  Value *Data, *Key;
+  if (match(KeyArg, m_ZeroInt()) &&
+      match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+    IC.replaceOperand(II, 0, Data);
+    IC.replaceOperand(II, 1, Key);
+    return &II;
+  }
+
+  return nullptr;
+}
+
+} // namespace ARMCommon
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_component_library(LLVMTransformUtils
   AddDiscriminators.cpp
   AMDGPUEmitPrintf.cpp
+  ARMCommonInstCombineIntrinsic.cpp
   ASanStackFrameLayout.cpp
   AssumeBundleBuilder.cpp
   BasicBlockUtils.cpp

diff --git a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt --mtriple=aarch64 -S -passes=instcombine < %s | FileCheck %s
 ; ARM64 AES intrinsic variants
 
 define <16 x i8> @combineXorAeseZeroARM64(<16 x i8> %data, <16 x i8> %key) {

diff --git a/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
 
 define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
 ; CHECK-LABEL: define <4 x i32> @mulByZero(

diff --git a/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
 ; ARM AES intrinsic variants
 
 define <16 x i8> @combineXorAeseZeroARM(<16 x i8> %data, <16 x i8> %key) {