diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py index f1b934f7139e9..8741147a4f8a3 100755 --- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py +++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py @@ -87,7 +87,7 @@ def find_compilation_database(path: str) -> str: def get_tidy_invocation( - f: str, + f: Optional[str], clang_tidy_binary: str, checks: str, tmpdir: Optional[str], @@ -147,7 +147,8 @@ def get_tidy_invocation( start.append(f"--warnings-as-errors={warnings_as_errors}") if allow_no_checks: start.append("--allow-no-checks") - start.append(f) + if f: + start.append(f) return start @@ -490,7 +491,7 @@ async def main() -> None: try: invocation = get_tidy_invocation( - "", + None, clang_tidy_binary, args.checks, None, diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index 1b1bcf78c9855..a2b856ad30519 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" +#include "llvm/ADT/identity.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" @@ -368,7 +369,11 @@ static FunctionProtoTypeLoc getPrototypeLoc(Expr *Fn) { } if (auto F = Target.getAs<FunctionProtoTypeLoc>()) { - return F; + // In some edge cases the AST can contain a "trivial" FunctionProtoTypeLoc + // which has null parameters. Avoid these as they don't contain useful + // information. + if (llvm::all_of(F.getParams(), llvm::identity<ParmVarDecl *>())) + return F; } return {}; diff --git a/clang-tools-extra/clangd/ModulesBuilder.cpp b/clang-tools-extra/clangd/ModulesBuilder.cpp index bee31fe51555e..2d2f0f6374486 100644 --- a/clang-tools-extra/clangd/ModulesBuilder.cpp +++ b/clang-tools-extra/clangd/ModulesBuilder.cpp @@ -360,9 +360,9 @@ void ModuleFileCache::remove(StringRef ModuleName) { /// Collect the directly and indirectly required module names for \param /// ModuleName in topological order. The \param ModuleName is guaranteed to /// be the last element in \param ModuleNames. -llvm::SmallVector<StringRef> getAllRequiredModules(ProjectModules &MDB, +llvm::SmallVector<std::string> getAllRequiredModules(ProjectModules &MDB, StringRef ModuleName) { - llvm::SmallVector<StringRef> ModuleNames; + llvm::SmallVector<std::string> ModuleNames; llvm::StringSet<> ModuleNamesSet; auto VisitDeps = [&](StringRef ModuleName, auto Visitor) -> void { @@ -373,7 +373,7 @@ llvm::SmallVector<StringRef> getAllRequiredModules(ProjectModules &MDB, if (ModuleNamesSet.insert(RequiredModuleName).second) Visitor(RequiredModuleName, Visitor); - ModuleNames.push_back(ModuleName); + ModuleNames.push_back(ModuleName.str()); }; VisitDeps(ModuleName, VisitDeps); @@ -418,13 +418,13 @@ llvm::Error ModulesBuilder::ModulesBuilderImpl::getOrBuildModuleFile( // Get Required modules in topological order.
auto ReqModuleNames = getAllRequiredModules(MDB, ModuleName); for (llvm::StringRef ReqModuleName : ReqModuleNames) { - if (BuiltModuleFiles.isModuleUnitBuilt(ModuleName)) + if (BuiltModuleFiles.isModuleUnitBuilt(ReqModuleName)) continue; if (auto Cached = Cache.getModule(ReqModuleName)) { if (IsModuleFileUpToDate(Cached->getModuleFilePath(), BuiltModuleFiles, TFS.view(std::nullopt))) { - log("Reusing module {0} from {1}", ModuleName, + log("Reusing module {0} from {1}", ReqModuleName, Cached->getModuleFilePath()); BuiltModuleFiles.addModuleFile(std::move(Cached)); continue; @@ -432,14 +432,16 @@ llvm::Error ModulesBuilder::ModulesBuilderImpl::getOrBuildModuleFile( Cache.remove(ReqModuleName); } + std::string ReqFileName = + MDB.getSourceForModuleName(ReqModuleName); llvm::Expected MF = buildModuleFile( - ModuleName, ModuleUnitFileName, getCDB(), TFS, BuiltModuleFiles); + ReqModuleName, ReqFileName, getCDB(), TFS, BuiltModuleFiles); if (llvm::Error Err = MF.takeError()) return Err; - log("Built module {0} to {1}", ModuleName, MF->getModuleFilePath()); + log("Built module {0} to {1}", ReqModuleName, MF->getModuleFilePath()); auto BuiltModuleFile = std::make_shared(std::move(*MF)); - Cache.add(ModuleName, BuiltModuleFile); + Cache.add(ReqModuleName, BuiltModuleFile); BuiltModuleFiles.addModuleFile(std::move(BuiltModuleFile)); } diff --git a/clang-tools-extra/clangd/ProjectModules.h b/clang-tools-extra/clangd/ProjectModules.h index 48d52ac9deb89..5296508e0584d 100644 --- a/clang-tools-extra/clangd/ProjectModules.h +++ b/clang-tools-extra/clangd/ProjectModules.h @@ -42,7 +42,7 @@ class ProjectModules { llvm::unique_function; virtual std::vector getRequiredModules(PathRef File) = 0; - virtual PathRef + virtual std::string getSourceForModuleName(llvm::StringRef ModuleName, PathRef RequiredSrcFile = PathRef()) = 0; diff --git a/clang-tools-extra/clangd/ScanningProjectModules.cpp b/clang-tools-extra/clangd/ScanningProjectModules.cpp index e4dc11c1c2895..859aba3673dc4 100644 --- a/clang-tools-extra/clangd/ScanningProjectModules.cpp +++ b/clang-tools-extra/clangd/ScanningProjectModules.cpp @@ -66,7 +66,7 @@ class ModuleDependencyScanner { /// /// TODO: We should handle the case that there are multiple source files /// declaring the same module. - PathRef getSourceForModuleName(llvm::StringRef ModuleName) const; + std::string getSourceForModuleName(llvm::StringRef ModuleName) const; /// Return the direct required modules. Indirect required modules are not /// included. @@ -140,7 +140,7 @@ void ModuleDependencyScanner::globalScan( GlobalScanned = true; } -PathRef ModuleDependencyScanner::getSourceForModuleName( +std::string ModuleDependencyScanner::getSourceForModuleName( llvm::StringRef ModuleName) const { assert( GlobalScanned && @@ -189,7 +189,7 @@ class ScanningAllProjectModules : public ProjectModules { /// RequiredSourceFile is not used intentionally. See the comments of /// ModuleDependencyScanner for detail. - PathRef + std::string getSourceForModuleName(llvm::StringRef ModuleName, PathRef RequiredSourceFile = PathRef()) override { Scanner.globalScan(Mangler); diff --git a/clang-tools-extra/clangd/test/module_dependencies.test b/clang-tools-extra/clangd/test/module_dependencies.test new file mode 100644 index 0000000000000..79306a73da435 --- /dev/null +++ b/clang-tools-extra/clangd/test/module_dependencies.test @@ -0,0 +1,95 @@ +# A smoke test to check that a simple dependency chain for modules can work. 
+# +# FIXME: The test fails on Windows; see comments on https://github.com/llvm/llvm-project/pull/142828 +# UNSUPPORTED: system-windows +# +# RUN: rm -fr %t +# RUN: mkdir -p %t +# RUN: split-file %s %t +# +# RUN: sed -e "s|DIR|%/t|g" %t/compile_commands.json.tmpl > %t/compile_commands.json.tmp +# RUN: sed -e "s|CLANG_CC|%clang|g" %t/compile_commands.json.tmp > %t/compile_commands.json +# RUN: sed -e "s|DIR|%/t|g" %t/definition.jsonrpc.tmpl > %t/definition.jsonrpc.tmp +# +# On Windows, we need the URI in didOpen to look like "uri":"file:///C:/..." +# (with the extra slash in the front), so we add it here. +# RUN: sed -E -e 's|"file://([A-Z]):/|"file:///\1:/|g' %/t/definition.jsonrpc.tmp > %/t/definition.jsonrpc +# +# RUN: clangd -experimental-modules-support -lit-test < %t/definition.jsonrpc \ +# RUN: | FileCheck -strict-whitespace %t/definition.jsonrpc + +#--- A-frag.cppm +export module A:frag; +export void printA() {} + +#--- A.cppm +export module A; +export import :frag; + +#--- Use.cpp +import A; +void foo() { + print +} + +#--- compile_commands.json.tmpl +[ + { + "directory": "DIR", + "command": "CLANG_CC -fprebuilt-module-path=DIR -std=c++20 -o DIR/main.cpp.o -c DIR/Use.cpp", + "file": "DIR/Use.cpp" + }, + { + "directory": "DIR", + "command": "CLANG_CC -std=c++20 DIR/A.cppm --precompile -o DIR/A.pcm", + "file": "DIR/A.cppm" + }, + { + "directory": "DIR", + "command": "CLANG_CC -std=c++20 DIR/A-frag.cppm --precompile -o DIR/A-frag.pcm", + "file": "DIR/A-frag.cppm" + } +] + +#--- definition.jsonrpc.tmpl +{ + "jsonrpc": "2.0", + "id": 0, + "method": "initialize", + "params": { + "processId": 123, + "rootPath": "clangd", + "capabilities": { + "textDocument": { + "completion": { + "completionItem": { + "snippetSupport": true + } + } + } + }, + "trace": "off" + } +} +--- +{ + "jsonrpc": "2.0", + "method": "textDocument/didOpen", + "params": { + "textDocument": { + "uri": "file://DIR/Use.cpp", + "languageId": "cpp", + "version": 1, + "text": "import A;\nvoid foo() {\n print\n}\n" + } + } +} + +# CHECK: "message"{{.*}}printA{{.*}}(fix available) + +--- +{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"file://DIR/Use.cpp"},"context":{"triggerKind":1},"position":{"line":2,"character":6}}} +--- +{"jsonrpc":"2.0","id":2,"method":"shutdown"} +--- +{"jsonrpc":"2.0","method":"exit"} diff --git a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp index 77d78b8777fe3..8ed8401f9fce9 100644 --- a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp +++ b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp @@ -997,11 +997,16 @@ TEST(ParameterHints, FunctionPointer) { f3_t f3; using f4_t = void(__stdcall *)(int param); f4_t f4; + __attribute__((noreturn)) f4_t f5; void bar() { f1($f1[[42]]); f2($f2[[42]]); f3($f3[[42]]); f4($f4[[42]]); + // This one runs into an edge case in clang's type model + // and we can't extract the parameter name. But at least + // we shouldn't crash. + f5(42); } )cpp", ExpectedHint{"param: ", "f1"}, ExpectedHint{"param: ", "f2"}, diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 2ab597eb37048..0b2e9c5fabc36 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -190,6 +190,9 @@ Improvements to clang-tidy - Fixed bug in :program:`clang-tidy` by which `HeaderFilterRegex` did not take effect when passed via the `.clang-tidy` file. 
+- Fixed bug in :program:`run_clang_tidy.py` where the program would not + correctly display the checks enabled by the top-level `.clang-tidy` file. + New checks ^^^^^^^^^^ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b8f26ec9a5447..262bf4e3d4f5b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -314,6 +314,9 @@ C++2c Feature Support - Implemented `P3176R1 The Oxford variadic comma `_ +- The error produced when doing arithmetic operations on enums of different types + can be disabled with ``-Wno-enum-enum-conversion``. (#GH92340) + C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ - Removed the restriction to literal types in constexpr functions in C++23 mode. @@ -909,6 +912,8 @@ Bug Fixes in This Version being deleted has a potentially throwing destructor (#GH118660). - Clang now outputs correct values when #embed data contains bytes with negative signed char values (#GH102798). +- Fix crash due to unknown references and pointer implementation and handling of + base classes. (#GH139452) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1113,6 +1118,7 @@ Miscellaneous Clang Crashes Fixed - Fixed a crash when an unscoped enumeration declared by an opaque-enum-declaration within a class template with a dependent underlying type is subject to integral promotion. (#GH117960) +- Fix code completion crash involving PCH serialized templates. (#GH139019) OpenACC Specific Changes ------------------------ @@ -1267,6 +1273,8 @@ RISC-V Support - The option ``-mcmodel=large`` for the large code model is supported. - Bump RVV intrinsic to version 1.0, the spec: https://github.com/riscv-non-isa/rvv-intrinsic-doc/releases/tag/v1.0.0-rc4 +- `Zicsr` / `Zifencei` are allowed to be duplicated in the presence of `g` in `-march`. + CUDA/HIP Language Changes ^^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed a bug about overriding a constexpr pure-virtual member function with a non-constexpr virtual member function which causes compilation failure when including standard C++ header `format`. @@ -1465,6 +1473,9 @@ Crash and bug fixes - The ``unix.BlockInCriticalSection`` now recognizes the ``lock()`` member function as expected, even if it's inherited from a base class. Fixes (#GH104241). +- Fixed a crash when C++20 parenthesized initializer lists are used. This issue + was causing a crash in clang-tidy.
(#GH136041) + Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index ec2a140e04d5b..7180447e250ce 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -7567,9 +7567,13 @@ def warn_arith_conv_mixed_enum_types_cxx20 : Warning< "%sub{select_arith_conv_kind}0 " "different enumeration types%diff{ ($ and $)|}1,2 is deprecated">, InGroup<DeprecatedEnumEnumConversion>; -def err_conv_mixed_enum_types_cxx26 : Error< + +def err_conv_mixed_enum_types : Error< "invalid %sub{select_arith_conv_kind}0 " "different enumeration types%diff{ ($ and $)|}1,2">; +def warn_conv_mixed_enum_types_cxx26 : Warning< + err_conv_mixed_enum_types.Summary>, + InGroup<EnumEnumConversion>, DefaultError; def warn_arith_conv_mixed_anon_enum_types : Warning< warn_arith_conv_mixed_enum_types.Summary>, diff --git a/clang/include/clang/Driver/Distro.h b/clang/include/clang/Driver/Distro.h index b4d485dac8a26..c544a8c002191 100644 --- a/clang/include/clang/Driver/Distro.h +++ b/clang/include/clang/Driver/Distro.h @@ -39,6 +39,8 @@ class Distro { DebianBullseye, DebianBookworm, DebianTrixie, + DebianForky, + DebianDuke, Exherbo, RHEL5, RHEL6, @@ -128,7 +130,7 @@ class Distro { bool IsOpenSUSE() const { return DistroVal == OpenSUSE; } bool IsDebian() const { - return DistroVal >= DebianLenny && DistroVal <= DebianTrixie; + return DistroVal >= DebianLenny && DistroVal <= DebianDuke; } bool IsUbuntu() const { diff --git a/clang/include/clang/Interpreter/Interpreter.h b/clang/include/clang/Interpreter/Interpreter.h index 56213f88b9e30..f8663e3193a18 100644 --- a/clang/include/clang/Interpreter/Interpreter.h +++ b/clang/include/clang/Interpreter/Interpreter.h @@ -116,6 +116,9 @@ class Interpreter { /// Compiler instance performing the incremental compilation. std::unique_ptr<CompilerInstance> CI; + /// An optional compiler instance for CUDA offloading + std::unique_ptr<CompilerInstance> DeviceCI; + protected: // Derived classes can use an extended interface of the Interpreter. Interpreter(std::unique_ptr<CompilerInstance> Instance, llvm::Error &Err, diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def index 34bb7a809162b..dbb8e832db5ff 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def @@ -385,6 +385,19 @@ ANALYZER_OPTION( "flex\" won't be analyzed.", true) +ANALYZER_OPTION( + bool, InlineFunctionsWithAmbiguousLoops, "inline-functions-with-ambiguous-loops", + "If disabled (the default), the analyzer puts functions on a \"do not " + "inline this\" list if it finds an execution path within that function " + "that may potentially perform 'analyzer-max-loop' (= 4 by default) " + "iterations in a loop. (Note that functions that _definitely_ reach the " + "loop limit on some execution path are currently marked as \"do not " + "inline\" even if this option is enabled.) Enabling this option " + "eliminates this (somewhat arbitrary) restriction from the analysis " + "scope, which increases the analysis runtime (on average by ~10%, but " + "a few translation units may see much larger slowdowns).", + false) + //===----------------------------------------------------------------------===// // Unsigned analyzer options.
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/FunctionSummary.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/FunctionSummary.h index 3ee0d229cfc29..761395260a0cf 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/FunctionSummary.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/FunctionSummary.h @@ -81,10 +81,6 @@ class FunctionSummariesTy { I->second.MayInline = 0; } - void markReachedMaxBlockCount(const Decl *D) { - markShouldNotInline(D); - } - std::optional mayInline(const Decl *D) { MapTy::const_iterator I = Map.find(D); if (I != Map.end() && I->second.InlineChecked) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 5aae78dd2fee7..209b269122a8e 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -3311,7 +3311,11 @@ static bool HandleLValueBase(EvalInfo &Info, const Expr *E, LValue &Obj, return false; // Extract most-derived object and corresponding type. - DerivedDecl = D.MostDerivedType->getAsCXXRecordDecl(); + // FIXME: After implementing P2280R4 it became possible to get references + // here. We do MostDerivedType->getAsCXXRecordDecl() in several other + // locations and if we see crashes in those locations in the future + // it may make more sense to move this fix into Lvalue::set. + DerivedDecl = D.MostDerivedType.getNonReferenceType()->getAsCXXRecordDecl(); if (!CastToDerivedClass(Info, E, Obj, DerivedDecl, D.MostDerivedPathLength)) return false; @@ -3521,7 +3525,12 @@ static bool evaluateVarDeclInit(EvalInfo &Info, const Expr *E, // should begin within the evaluation of E // Used to be C++20 [expr.const]p5.12.2: // ... its lifetime began within the evaluation of E; - if (isa(VD) && !AllowConstexprUnknown) { + if (isa(VD)) { + if (AllowConstexprUnknown) { + Result = &Info.CurrentCall->createConstexprUnknownAPValues(VD, Base); + return true; + } + // Assume parameters of a potential constant expression are usable in // constant expressions. 
if (!Info.checkingPotentialConstantExpression() || @@ -12710,11 +12719,13 @@ static bool determineEndOffset(EvalInfo &Info, SourceLocation ExprLoc, bool DetermineForCompleteObject = refersToCompleteObject(LVal); auto CheckedHandleSizeof = [&](QualType Ty, CharUnits &Result) { - if (Ty.isNull() || Ty->isIncompleteType() || Ty->isFunctionType()) + if (Ty.isNull()) return false; - if (Ty->isReferenceType()) - Ty = Ty.getNonReferenceType(); + Ty = Ty.getNonReferenceType(); + + if (Ty->isIncompleteType() || Ty->isFunctionType()) + return false; return HandleSizeof(Info, ExprLoc, Ty, Result); }; diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 42b735ccf4a2c..cb35dbd611204 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -3552,7 +3552,21 @@ void MicrosoftCXXNameMangler::mangleType(const DependentSizedExtVectorType *T, void MicrosoftCXXNameMangler::mangleType(const ConstantMatrixType *T, Qualifiers quals, SourceRange Range) { - Error(Range.getBegin(), "matrix type") << Range; + QualType EltTy = T->getElementType(); + + llvm::SmallString<64> TemplateMangling; + llvm::raw_svector_ostream Stream(TemplateMangling); + MicrosoftCXXNameMangler Extra(Context, Stream); + + Stream << "?$"; + + Extra.mangleSourceName("__matrix"); + Extra.mangleType(EltTy, Range, QMM_Escape); + + Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumRows())); + Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumColumns())); + + mangleArtificialTagType(TagTypeKind::Struct, TemplateMangling, {"__clang"}); } void MicrosoftCXXNameMangler::mangleType(const DependentSizedMatrixType *T, diff --git a/clang/lib/AST/NestedNameSpecifier.cpp b/clang/lib/AST/NestedNameSpecifier.cpp index 76c77569da9fd..c043996f1ada3 100644 --- a/clang/lib/AST/NestedNameSpecifier.cpp +++ b/clang/lib/AST/NestedNameSpecifier.cpp @@ -283,13 +283,16 @@ void NestedNameSpecifier::print(raw_ostream &OS, const PrintingPolicy &Policy, case TypeSpec: { const auto *Record = dyn_cast_or_null(getAsRecordDecl()); - if (ResolveTemplateArguments && Record) { + const TemplateParameterList *TPL = nullptr; + if (Record) { + TPL = Record->getSpecializedTemplate()->getTemplateParameters(); + if (ResolveTemplateArguments) { // Print the type trait with resolved template parameters. Record->printName(OS, Policy); - printTemplateArgumentList( - OS, Record->getTemplateArgs().asArray(), Policy, - Record->getSpecializedTemplate()->getTemplateParameters()); + printTemplateArgumentList(OS, Record->getTemplateArgs().asArray(), + Policy, TPL); break; + } } const Type *T = getAsType(); @@ -313,8 +316,8 @@ void NestedNameSpecifier::print(raw_ostream &OS, const PrintingPolicy &Policy, TemplateName::Qualified::None); // Print the template argument list. - printTemplateArgumentList(OS, SpecType->template_arguments(), - InnerPolicy); + printTemplateArgumentList(OS, SpecType->template_arguments(), InnerPolicy, + TPL); } else if (const auto *DepSpecType = dyn_cast(T)) { // Print the template name without its corresponding @@ -322,7 +325,7 @@ void NestedNameSpecifier::print(raw_ostream &OS, const PrintingPolicy &Policy, OS << DepSpecType->getIdentifier()->getName(); // Print the template argument list. 
printTemplateArgumentList(OS, DepSpecType->template_arguments(), - InnerPolicy); + InnerPolicy, TPL); } else { // Print the type normally QualType(T, 0).print(OS, InnerPolicy); diff --git a/clang/lib/Analysis/LiveVariables.cpp b/clang/lib/Analysis/LiveVariables.cpp index 481932ee59c8e..5fb5ee767a683 100644 --- a/clang/lib/Analysis/LiveVariables.cpp +++ b/clang/lib/Analysis/LiveVariables.cpp @@ -662,12 +662,19 @@ void LiveVariables::dumpExprLiveness(const SourceManager &M) { } void LiveVariablesImpl::dumpExprLiveness(const SourceManager &M) { + const ASTContext &Ctx = analysisContext.getASTContext(); + auto ByIDs = [&Ctx](const Expr *L, const Expr *R) { + return L->getID(Ctx) < R->getID(Ctx); + }; + // Don't iterate over blockEndsToLiveness directly because it's not sorted. for (const CFGBlock *B : *analysisContext.getCFG()) { - llvm::errs() << "\n[ B" << B->getBlockID() << " (live expressions at block exit) ]\n"; - for (const Expr *E : blocksEndToLiveness[B].liveExprs) { + std::vector LiveExprs; + llvm::append_range(LiveExprs, blocksEndToLiveness[B].liveExprs); + llvm::sort(LiveExprs, ByIDs); + for (const Expr *E : LiveExprs) { llvm::errs() << "\n"; E->dump(); } diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index 991efd2bde01f..4cf4230273d38 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -618,14 +618,7 @@ class LLVM_LIBRARY_VISIBILITY SolarisTargetInfo : public OSTargetInfo { DefineStd(Builder, "unix", Opts); Builder.defineMacro("__svr4__"); Builder.defineMacro("__SVR4"); - // Solaris headers require _XOPEN_SOURCE to be set to 600 for C99 and - // newer, but to 500 for everything else. feature_test.h has a check to - // ensure that you are not using C99 with an old version of X/Open or C89 - // with a new version. 
- if (Opts.C99) - Builder.defineMacro("_XOPEN_SOURCE", "600"); - else - Builder.defineMacro("_XOPEN_SOURCE", "500"); + Builder.defineMacro("_XOPEN_SOURCE", "600"); if (Opts.CPlusPlus) { Builder.defineMacro("__C99FEATURES__"); Builder.defineMacro("_FILE_OFFSET_BITS", "64"); diff --git a/clang/lib/Basic/Targets/SystemZ.cpp b/clang/lib/Basic/Targets/SystemZ.cpp index c836d110d26d5..6326188b3bd18 100644 --- a/clang/lib/Basic/Targets/SystemZ.cpp +++ b/clang/lib/Basic/Targets/SystemZ.cpp @@ -105,7 +105,7 @@ static constexpr ISANameRevision ISARevisions[] = { {{"arch12"}, 12}, {{"z14"}, 12}, {{"arch13"}, 13}, {{"z15"}, 13}, {{"arch14"}, 14}, {{"z16"}, 14}, - {{"arch15"}, 15}, + {{"arch15"}, 15}, {{"z17"}, 15}, }; int SystemZTargetInfo::getISARevision(StringRef Name) const { diff --git a/clang/lib/Driver/Distro.cpp b/clang/lib/Driver/Distro.cpp index 3cc79535de8da..71ba71fa18379 100644 --- a/clang/lib/Driver/Distro.cpp +++ b/clang/lib/Driver/Distro.cpp @@ -160,6 +160,10 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { return Distro::DebianBookworm; case 13: return Distro::DebianTrixie; + case 14: + return Distro::DebianForky; + case 15: + return Distro::DebianDuke; default: return Distro::UnknownDistro; } @@ -173,6 +177,8 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { .Case("bullseye/sid", Distro::DebianBullseye) .Case("bookworm/sid", Distro::DebianBookworm) .Case("trixie/sid", Distro::DebianTrixie) + .Case("forky/sid", Distro::DebianForky) + .Case("duke/sid", Distro::DebianDuke) .Default(Distro::UnknownDistro); } diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index 0575a1ebef3a6..1666253db54cb 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -253,6 +253,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, Features.push_back("+lsx"); } else /*-mno-lsx*/ { Features.push_back("-lsx"); + Features.push_back("-lasx"); } } diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 6f7d213c0b559..d953348b0258d 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -1452,7 +1452,9 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { (PreviousNonComment->ClosesTemplateDeclaration || PreviousNonComment->ClosesRequiresClause || (PreviousNonComment->is(TT_AttributeMacro) && - Current.isNot(tok::l_paren)) || + Current.isNot(tok::l_paren) && + !Current.endsSequence(TT_StartOfName, TT_AttributeMacro, + TT_PointerOrReference)) || PreviousNonComment->isOneOf( TT_AttributeRParen, TT_AttributeSquare, TT_FunctionAnnotationRParen, TT_JavaAnnotation, TT_LeadingJavaAnnotation))) || diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index b97d8928178b5..aba7db6dd50a8 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3743,8 +3743,10 @@ reformat(const FormatStyle &Style, StringRef Code, tooling::Replacements Replaces = Formatter(*Env, Style, Status).process().first; // add a replacement to remove the "x = " from the result. - Replaces = Replaces.merge( - tooling::Replacements(tooling::Replacement(FileName, 0, 4, ""))); + if (Code.starts_with("x = ")) { + Replaces = Replaces.merge( + tooling::Replacements(tooling::Replacement(FileName, 0, 4, ""))); + } // apply the reformatting changes and the removal of "x = ". 
if (applyAllReplacements(Code, Replaces)) return {Replaces, 0}; diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 16f0a76f3a954..0755a5d355394 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -636,6 +636,36 @@ bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) { return true; } +void FormatTokenLexer::tryParseJavaTextBlock() { + if (FormatTok->TokenText != "\"\"") + return; + + const auto *S = Lex->getBufferLocation(); + const auto *End = Lex->getBuffer().end(); + + if (S == End || *S != '\"') + return; + + ++S; // Skip the `"""` that begins a text block. + + // Find the `"""` that ends the text block. + for (int Count = 0; Count < 3 && S < End; ++S) { + switch (*S) { + case '\\': + Count = -1; + break; + case '\"': + ++Count; + break; + default: + Count = 0; + } + } + + // Ignore the possibly invalid text block. + resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(S))); +} + // Tries to parse a JavaScript Regex literal starting at the current token, // if that begins with a slash and is in a location where JavaScript allows // regex literals. Changes the current token to a regex literal and updates @@ -1326,6 +1356,9 @@ FormatToken *FormatTokenLexer::getNextToken() { FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); ++Column; StateStack.push(LexerState::TOKEN_STASHED); + } else if (Style.Language == FormatStyle::LK_Java && + FormatTok->is(tok::string_literal)) { + tryParseJavaTextBlock(); } if (Style.isVerilog() && Tokens.size() > 0 && diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h index 61474a3f9ada8..d9a25c8ef3538 100644 --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -71,6 +71,8 @@ class FormatTokenLexer { bool canPrecedeRegexLiteral(FormatToken *Prev); + void tryParseJavaTextBlock(); + // Tries to parse a JavaScript Regex literal starting at the current token, // if that begins with a slash and is in a location where JavaScript allows // regex literals. 
Changes the current token to a regex literal and updates diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 11b941c5a0411..0c13356ca96de 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3839,6 +3839,8 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts, } else { if (Current.isNot(TT_StartOfName) || Current.NestingLevel != 0) return false; + while (Next && Next->startsSequence(tok::hashhash, tok::identifier)) + Next = Next->Next->Next; for (; Next; Next = Next->Next) { if (Next->is(TT_TemplateOpener) && Next->MatchingParen) { Next = Next->MatchingParen; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 2b348c926294e..673b3e6c4b8c2 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1837,8 +1837,8 @@ void UnwrappedLineParser::parseStructuralElement( nextToken(); if (FormatTok->is(tok::l_paren)) { parseParens(); - assert(FormatTok->Previous); - if (FormatTok->Previous->endsSequence(tok::r_paren, tok::kw_auto, + if (FormatTok->Previous && + FormatTok->Previous->endsSequence(tok::r_paren, tok::kw_auto, tok::l_paren)) { Line->SeenDecltypeAuto = true; } @@ -2581,7 +2581,8 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) { if (Prev) { auto OptionalParens = [&] { if (MightBeStmtExpr || MightBeFoldExpr || Line->InMacroBody || - SeenComma || Style.RemoveParentheses == FormatStyle::RPS_Leave) { + SeenComma || Style.RemoveParentheses == FormatStyle::RPS_Leave || + RParen->getPreviousNonComment() == LParen) { return false; } const bool DoubleParens = diff --git a/clang/lib/Interpreter/DeviceOffload.cpp b/clang/lib/Interpreter/DeviceOffload.cpp index 7d0125403ea52..05625ddedb72f 100644 --- a/clang/lib/Interpreter/DeviceOffload.cpp +++ b/clang/lib/Interpreter/DeviceOffload.cpp @@ -25,13 +25,12 @@ namespace clang { IncrementalCUDADeviceParser::IncrementalCUDADeviceParser( - std::unique_ptr DeviceInstance, - CompilerInstance &HostInstance, + CompilerInstance &DeviceInstance, CompilerInstance &HostInstance, llvm::IntrusiveRefCntPtr FS, llvm::Error &Err, const std::list &PTUs) - : IncrementalParser(*DeviceInstance, Err), PTUs(PTUs), VFS(FS), + : IncrementalParser(DeviceInstance, Err), PTUs(PTUs), VFS(FS), CodeGenOpts(HostInstance.getCodeGenOpts()), - TargetOpts(DeviceInstance->getTargetOpts()) { + TargetOpts(DeviceInstance.getTargetOpts()) { if (Err) return; StringRef Arch = TargetOpts.CPU; @@ -41,7 +40,6 @@ IncrementalCUDADeviceParser::IncrementalCUDADeviceParser( llvm::inconvertibleErrorCode())); return; } - DeviceCI = std::move(DeviceInstance); } llvm::Expected IncrementalCUDADeviceParser::GeneratePTX() { diff --git a/clang/lib/Interpreter/DeviceOffload.h b/clang/lib/Interpreter/DeviceOffload.h index 43645033c4840..0b903e31c6799 100644 --- a/clang/lib/Interpreter/DeviceOffload.h +++ b/clang/lib/Interpreter/DeviceOffload.h @@ -28,8 +28,7 @@ class IncrementalCUDADeviceParser : public IncrementalParser { public: IncrementalCUDADeviceParser( - std::unique_ptr DeviceInstance, - CompilerInstance &HostInstance, + CompilerInstance &DeviceInstance, CompilerInstance &HostInstance, llvm::IntrusiveRefCntPtr VFS, llvm::Error &Err, const std::list &PTUs); @@ -42,7 +41,6 @@ class IncrementalCUDADeviceParser : public IncrementalParser { ~IncrementalCUDADeviceParser(); protected: - std::unique_ptr DeviceCI; int SMVersion; llvm::SmallString<1024> PTXCode; llvm::SmallVector FatbinContent; diff --git 
a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp index e43cea1baf43a..1d223e230669c 100644 --- a/clang/lib/Interpreter/IncrementalParser.cpp +++ b/clang/lib/Interpreter/IncrementalParser.cpp @@ -175,7 +175,7 @@ void IncrementalParser::CleanUpPTU(TranslationUnitDecl *MostRecentTU) { // FIXME: We should de-allocate MostRecentTU for (Decl *D : MostRecentTU->decls()) { auto *ND = dyn_cast<NamedDecl>(D); - if (!ND) + if (!ND || ND->getDeclName().isEmpty()) continue; // Check if we need to clean up the IdResolver chain. if (ND->getDeclName().getFETokenInfo() && !D->getLangOpts().ObjC && diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index f91563dd0378c..3b81f9d701b42 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -416,6 +416,10 @@ Interpreter::Interpreter(std::unique_ptr<CompilerInstance> Instance, Interpreter::~Interpreter() { IncrParser.reset(); Act->FinalizeAction(); + if (DeviceParser) + DeviceParser.reset(); + if (DeviceAct) + DeviceAct->FinalizeAction(); if (IncrExecutor) { if (llvm::Error Err = IncrExecutor->cleanUp()) llvm::report_fatal_error( @@ -501,8 +505,11 @@ Interpreter::createWithCUDA(std::unique_ptr<CompilerInstance> CI, DCI->ExecuteAction(*Interp->DeviceAct); + Interp->DeviceCI = std::move(DCI); + auto DeviceParser = std::make_unique<IncrementalCUDADeviceParser>( - std::move(DCI), *Interp->getCompilerInstance(), IMVFS, Err, Interp->PTUs); + *Interp->DeviceCI, *Interp->getCompilerInstance(), IMVFS, Err, + Interp->PTUs); if (Err) return std::move(Err); diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 0cadede51a9b3..2fab1dfed4a00 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -2237,8 +2237,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { if (PP.isCodeCompletionReached() && !CalledSignatureHelp) RunSignatureHelp(); LHS = ExprError(); - } else if (!HasError && HasTrailingComma) { - Diag(Tok, diag::err_expected_expression); } else if (LHS.isInvalid()) { for (auto &E : ArgExprs) Actions.CorrectDelayedTyposInExpr(E); @@ -3738,7 +3736,6 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs, if (Tok.is(tok::r_paren)) { if (HasTrailingComma) *HasTrailingComma = true; - break; } } if (SawError) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e253e3a17328f..23d0f9532d4f8 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -1519,7 +1519,7 @@ static void checkEnumArithmeticConversions(Sema &S, Expr *LHS, Expr *RHS, // In C++ 26, usual arithmetic conversions between 2 different enum types // are ill-formed. if (S.getLangOpts().CPlusPlus26) - DiagID = diag::err_conv_mixed_enum_types_cxx26; + DiagID = diag::warn_conv_mixed_enum_types_cxx26; else if (!L->castAs<EnumType>()->getDecl()->hasNameForLinkage() || !R->castAs<EnumType>()->getDecl()->hasNameForLinkage()) { // If either enumeration type is unnamed, it's less likely that the diff --git a/clang/lib/Serialization/TemplateArgumentHasher.cpp b/clang/lib/Serialization/TemplateArgumentHasher.cpp index 598f098f526d0..5fd6941256fe2 100644 --- a/clang/lib/Serialization/TemplateArgumentHasher.cpp +++ b/clang/lib/Serialization/TemplateArgumentHasher.cpp @@ -65,7 +65,9 @@ void TemplateArgumentHasher::AddTemplateArgument(TemplateArgument TA) { switch (Kind) { case TemplateArgument::Null: - llvm_unreachable("Expected valid TemplateArgument"); + // These can occur in incomplete substitutions performed with code + // completion (see PartialOverloading).
+ break; case TemplateArgument::Type: AddQualType(TA.getAsType()); break; diff --git a/clang/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp b/clang/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp index a0bf776b11f53..e58329817d7cd 100644 --- a/clang/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp @@ -379,9 +379,9 @@ void DynamicTypePropagation::checkPostCall(const CallEvent &Call, // aggregates, and in such case no top-frame constructor will be called. // Figure out if we need to do anything in this case. // FIXME: Instead of relying on the ParentMap, we should have the - // trigger-statement (InitListExpr in this case) available in this - // callback, ideally as part of CallEvent. - if (isa_and_nonnull( + // trigger-statement (InitListExpr or CXXParenListInitExpr in this case) + // available in this callback, ideally as part of CallEvent. + if (isa_and_nonnull( LCtx->getParentMap().getParent(Ctor->getOriginExpr()))) return; diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 140c77790496d..cfb8be2e7f0f8 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -2510,6 +2510,20 @@ bool ExprEngine::replayWithoutInlining(ExplodedNode *N, return true; } +/// Return the innermost location context which is inlined at `Node`, unless +/// it's the top-level (entry point) location context. +static const LocationContext *getInlinedLocationContext(ExplodedNode *Node, + ExplodedGraph &G) { + const LocationContext *CalleeLC = Node->getLocation().getLocationContext(); + const LocationContext *RootLC = + (*G.roots_begin())->getLocation().getLocationContext(); + + if (CalleeLC->getStackFrame() == RootLC->getStackFrame()) + return nullptr; + + return CalleeLC; +} + /// Block entrance. (Update counters). void ExprEngine::processCFGBlockEntrance(const BlockEdge &L, NodeBuilderWithSinks &nodeBuilder, @@ -2557,21 +2571,24 @@ void ExprEngine::processCFGBlockEntrance(const BlockEdge &L, const ExplodedNode *Sink = nodeBuilder.generateSink(Pred->getState(), Pred, &tag); - // Check if we stopped at the top level function or not. - // Root node should have the location context of the top most function. - const LocationContext *CalleeLC = Pred->getLocation().getLocationContext(); - const LocationContext *CalleeSF = CalleeLC->getStackFrame(); - const LocationContext *RootLC = - (*G.roots_begin())->getLocation().getLocationContext(); - if (RootLC->getStackFrame() != CalleeSF) { - Engine.FunctionSummaries->markReachedMaxBlockCount(CalleeSF->getDecl()); + if (const LocationContext *LC = getInlinedLocationContext(Pred, G)) { + // FIXME: This will unconditionally prevent inlining this function (even + // from other entry points), which is not a reasonable heuristic: even if + // we reached max block count on this particular execution path, there + // may be other execution paths (especially with other parametrizations) + // where the analyzer can reach the end of the function (so there is no + // natural reason to avoid inlining it). However, disabling this would + // significantly increase the analysis time (because more entry points + // would exhaust their allocated budget), so it must be compensated by a + // different (more reasonable) reduction of analysis scope. 
+ Engine.FunctionSummaries->markShouldNotInline( + LC->getStackFrame()->getDecl()); // Re-run the call evaluation without inlining it, by storing the // no-inlining policy in the state and enqueuing the new work item on // the list. Replay should almost never fail. Use the stats to catch it // if it does. - if ((!AMgr.options.NoRetryExhausted && - replayWithoutInlining(Pred, CalleeLC))) + if ((!AMgr.options.NoRetryExhausted && replayWithoutInlining(Pred, LC))) return; NumMaxBlockCountReachedInInlined++; } else @@ -2835,8 +2852,29 @@ void ExprEngine::processBranch( // conflicts with the widen-loop analysis option (which is off by // default). If we intend to support and stabilize the loop widening, // we must ensure that it 'plays nicely' with this logic. - if (!SkipTrueBranch || AMgr.options.ShouldWidenLoops) + if (!SkipTrueBranch || AMgr.options.ShouldWidenLoops) { Builder.generateNode(StTrue, true, PredN); + } else if (!AMgr.options.InlineFunctionsWithAmbiguousLoops) { + // FIXME: There is an ancient and arbitrary heuristic in + // `ExprEngine::processCFGBlockEntrance` which prevents all further + // inlining of a function if it finds an execution path within that + // function which reaches the `MaxBlockVisitOnPath` limit (a/k/a + // `analyzer-max-loop`, by default four iterations in a loop). Adding + // this "don't assume third iteration" logic significantly increased + // the analysis runtime on some inputs because fewer functions were + // arbitrarily excluded from being inlined, so more entry points used + // up their full allocated budget. As a hacky compensation for this, + // here we apply the "should not inline" mark in cases when the loop + // could potentially reach the `MaxBlockVisitOnPath` limit without the + // "don't assume third iteration" logic. This slightly overcompensates + // (activates if the third iteration can be entered, and will not + // recognize cases where the fourth iteration wouldn't be completed), but + // should be good enough for practical purposes. + if (const LocationContext *LC = getInlinedLocationContext(Pred, G)) { + Engine.FunctionSummaries->markShouldNotInline( + LC->getStackFrame()->getDecl()); + } + } } if (StFalse) diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp index f7020da2e6da2..30839a40389ba 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp @@ -637,9 +637,10 @@ void ExprEngine::handleConstructor(const Expr *E, // FIXME: For now this code essentially bails out. We need to find the // correct target region and set it. // FIXME: Instead of relying on the ParentMap, we should have the - // trigger-statement (InitListExpr in this case) passed down from CFG or - // otherwise always available during construction. - if (isa_and_nonnull<InitListExpr>(LCtx->getParentMap().getParent(E))) { + // trigger-statement (InitListExpr or CXXParenListInitExpr in this case) + // passed down from CFG or otherwise always available during construction. + if (isa_and_nonnull<InitListExpr, CXXParenListInitExpr>( + LCtx->getParentMap().getParent(E))) { MemRegionManager &MRMgr = getSValBuilder().getRegionManager(); Target = loc::MemRegionVal(MRMgr.getCXXTempObjectRegion(E, LCtx)); CallOpts.IsCtorOrDtorWithImproperlyModeledTargetRegion = true; @@ -1010,7 +1011,8 @@ void ExprEngine::VisitCXXNewExpr(const CXXNewExpr *CNE, ExplodedNode *Pred, // values are properly placed inside the required region, however if an // initializer list is used, this doesn't happen automatically.
auto *Init = CNE->getInitializer(); + bool isInitList = + isa_and_nonnull<InitListExpr, CXXParenListInitExpr>(Init); QualType ObjTy = isInitList ? Init->getType() : CNE->getType()->getPointeeType(); diff --git a/clang/test/Analysis/PR135665.cpp b/clang/test/Analysis/PR135665.cpp new file mode 100644 index 0000000000000..124b8c9b97b04 --- /dev/null +++ b/clang/test/Analysis/PR135665.cpp @@ -0,0 +1,19 @@ +// RUN: %clang_analyze_cc1 -std=c++20 -analyzer-checker=core -verify %s + +// expected-no-diagnostics + +template <typename... F> +struct overload : public F... +{ + using F::operator()...; +}; + +template <typename... F> +overload(F&&...) -> overload<F...>; + +int main() +{ + const auto l = overload([](const int* i) {}); + + return 0; +} diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c index d5eb790b82f23..b47ca59e79827 100644 --- a/clang/test/Analysis/analyzer-config.c +++ b/clang/test/Analysis/analyzer-config.c @@ -88,6 +88,7 @@ // CHECK-NEXT: graph-trim-interval = 1000 // CHECK-NEXT: ignore-bison-generated-files = true // CHECK-NEXT: ignore-flex-generated-files = true +// CHECK-NEXT: inline-functions-with-ambiguous-loops = false // CHECK-NEXT: inline-lambdas = true // CHECK-NEXT: ipa = dynamic-bifurcate // CHECK-NEXT: ipa-always-inline-size = 3 diff --git a/clang/test/Analysis/live-stmts.cpp b/clang/test/Analysis/live-stmts.cpp index c60f522588e39..ca2ff6da8b133 100644 --- a/clang/test/Analysis/live-stmts.cpp +++ b/clang/test/Analysis/live-stmts.cpp @@ -44,6 +44,8 @@ int testThatDumperWorks(int x, int y, int z) { // CHECK-NEXT: ImplicitCastExpr {{.*}} // CHECK-NEXT: `-ImplicitCastExpr {{.*}} // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' +// CHECK-EMPTY: +// CHECK-EMPTY: // CHECK: [ B4 (live expressions at block exit) ] // CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' diff --git a/clang/test/Analysis/loop-based-inlining-prevention.c b/clang/test/Analysis/loop-based-inlining-prevention.c new file mode 100644 index 0000000000000..73627112e2d32 --- /dev/null +++ b/clang/test/Analysis/loop-based-inlining-prevention.c @@ -0,0 +1,200 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify=expected,default %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config inline-functions-with-ambiguous-loops=true -verify=expected,enabled %s + +// This file tests some heuristics in the engine that put functions on a +// "do not inline" list if their analysis reaches the `analyzer-max-loop` +// limit (by default 4 iterations) in a loop. This was almost surely intended +// as memoization optimization for the "retry without inlining" fallback (if we +// had to retry once, next time don't even try inlining), but aggressively +// oversteps the "natural" scope: reaching 4 iterations on _one particular_ +// execution path does not imply that each path would need "retry without +// inlining" especially if a different call receives different arguments. +// +// This heuristic significantly affects the scope/depth of the analysis (and +// therefore the execution time) because without this limitation on the +// inlining significantly more entry points would be able to exhaust their +// `max-nodes` quota. (Trivial thin wrappers around big complex functions are +// common in many projects.) +// +// Unfortunately, this arbitrary heuristic strongly relies on the current loop +// handling model and its many limitations, so improvements in loop handling +// can cause surprising slowdowns by reducing the "do not inline" blacklist.
+// In the tests "FIXME-BUT-NEEDED" comments mark "problematic" (aka buggy) +// analyzer behavior which cannot be fixed without also improving the +// heuristics for (not) inlining large functions. + + int getNum(void); // Get an unknown symbolic number. + +void clang_analyzer_dump(int arg); + +//----------------------------------------------------------------------------- +// Simple case: inlined function never reaches `analyzer-max-loop`, so it is +// always inlined. + +int inner_simple(int callIdx) { + clang_analyzer_dump(callIdx); // expected-warning {{1 S32}} + // expected-warning@-1 {{2 S32}} + return 42; +} + +int outer_simple(void) { + int x = inner_simple(1); + int y = inner_simple(2); + return 53 / (x - y); // expected-warning {{Division by zero}} +} + +//----------------------------------------------------------------------------- +// Inlined function always reaches `analyzer-max-loop`, which stops the +// analysis on that path and puts the function on the "do not inline" list. + +int inner_fixed_loop_1(int callIdx) { + int i; + clang_analyzer_dump(callIdx); // expected-warning {{1 S32}} + for (i = 0; i < 10; i++); // FIXME-BUT-NEEDED: This stops the analysis. + clang_analyzer_dump(callIdx); // no-warning + return 42; +} + +int outer_fixed_loop_1(void) { + int x = inner_fixed_loop_1(1); + int y = inner_fixed_loop_1(2); + + // FIXME-BUT-NEEDED: The analysis doesn't reach this zero division. + return 53 / (x - y); // no-warning +} + +//----------------------------------------------------------------------------- +// Inlined function always reaches `analyzer-max-loop`; inlining is prevented +// even for different entry points. +// NOTE: the analyzer happens to analyze the entry points in a reversed order, +// so `outer_2_fixed_loop_2` is analyzed first and it will be the one which is +// able to inline the inner function. + +int inner_fixed_loop_2(int callIdx) { + // Identical copy of inner_fixed_loop_1. + int i; + clang_analyzer_dump(callIdx); // expected-warning {{2 S32}} + for (i = 0; i < 10; i++); // FIXME-BUT-NEEDED: This stops the analysis. + clang_analyzer_dump(callIdx); // no-warning + return 42; +} + +int outer_1_fixed_loop_2(void) { + return inner_fixed_loop_2(1); +} + +int outer_2_fixed_loop_2(void) { + return inner_fixed_loop_2(2); +} + +//----------------------------------------------------------------------------- +// Inlined function reaches `analyzer-max-loop` only in its second call. The +// function is inlined twice but the second call doesn't finish and ends up +// being conservatively evaluated. + +int inner_parametrized_loop_1(int count) { + int i; + clang_analyzer_dump(count); // expected-warning {{2 S32}} + // expected-warning@-1 {{10 S32}} + for (i = 0; i < count; i++); + // FIXME-BUT-NEEDED: This loop stops the analysis when count >=4. + clang_analyzer_dump(count); // expected-warning {{2 S32}} + return 42; +} + +int outer_parametrized_loop_1(void) { + int x = inner_parametrized_loop_1(2); + int y = inner_parametrized_loop_1(10); + + // FIXME-BUT-NEEDED: The analysis doesn't reach this zero division. + return 53 / (x - y); // no-warning +} + +//----------------------------------------------------------------------------- +// Inlined function reaches `analyzer-max-loop` on its first call, so the +// second call isn't inlined (although it could be fully evaluated). + +int inner_parametrized_loop_2(int count) { + // Identical copy of inner_parametrized_loop_1. 
+ int i; + clang_analyzer_dump(count); // expected-warning {{10 S32}} + for (i = 0; i < count; i++); + // FIXME-BUT-NEEDED: This loop stops the analysis when count >=4. + clang_analyzer_dump(count); // no-warning + return 42; +} + +int outer_parametrized_loop_2(void) { + int y = inner_parametrized_loop_2(10); + int x = inner_parametrized_loop_2(2); + + // FIXME-BUT-NEEDED: The analysis doesn't reach this zero division. + return 53 / (x - y); // no-warning +} + +//----------------------------------------------------------------------------- +// Inlined function may or may not reach `analyzer-max-loop` depending on an +// ambiguous check before the loop. This is very similar to the "fixed loop" +// cases: the function is placed on the "don't inline" list when any execution +// path reaches `analyzer-max-loop` (even if other execution paths reach the +// end of the function). +// NOTE: This is tested with two separate entry points to ensure that one +// inlined call is fully evaluated before we try to inline the other call. +// NOTE: the analyzer happens to analyze the entry points in a reversed order, +// so `outer_2_conditional_loop` is analyzed first and it will be the one which +// is able to inline the inner function. + +int inner_conditional_loop(int callIdx) { + int i; + clang_analyzer_dump(callIdx); // expected-warning {{2 S32}} + if (getNum() == 777) { + for (i = 0; i < 10; i++); + } + clang_analyzer_dump(callIdx); // expected-warning {{2 S32}} + return 42; +} + +int outer_1_conditional_loop(void) { + return inner_conditional_loop(1); +} + +int outer_2_conditional_loop(void) { + return inner_conditional_loop(2); +} + +//----------------------------------------------------------------------------- +// Inlined function executes an ambiguous loop that may or may not reach +// `analyzer-max-loop`. Historically, before the "don't assume third iteration" +// commit (bb27d5e5c6b194a1440b8ac4e5ace68d0ee2a849) this worked like the +// `conditional_loop` cases: the analyzer was able to find a path reaching +// `analyzer-max-loop` so inlining was disabled. After that commit the analyzer +// does not _assume_ a third (or later) iteration (i.e. does not enter those +// iterations if the loop condition is an unknown value), so e.g. this test +// function does not reach `analyzer-max-loop` iterations and the inlining is +// not disabled. +// Unfortunately this change significantly increased the workload and +// runtime of the analyzer (more entry points used up their budget), so the +// option `inline-functions-with-ambiguous-loops` was introduced and disabled +// by default to suppress the inlining in situations where the "don't assume +// third iteration" logic activates. +// NOTE: This is tested with two separate entry points to ensure that one +// inlined call is fully evaluated before we try to inline the other call. +// NOTE: the analyzer happens to analyze the entry points in a reversed order, +// so `outer_2_ambiguous_loop` is analyzed first and it will be the one which +// is able to inline the inner function. 
+ +int inner_ambiguous_loop(int callIdx) { + int i; + clang_analyzer_dump(callIdx); // default-warning {{2 S32}} + // enabled-warning@-1 {{1 S32}} + // enabled-warning@-2 {{2 S32}} + for (i = 0; i < getNum(); i++); + return i; +} + +int outer_1_ambiguous_loop(void) { + return inner_ambiguous_loop(1); +} +int outer_2_ambiguous_loop(void) { + return inner_ambiguous_loop(2); +} diff --git a/clang/test/Analysis/loop-unrolling.cpp b/clang/test/Analysis/loop-unrolling.cpp index bf05a7739ce48..ebae81e000c7a 100644 --- a/clang/test/Analysis/loop-unrolling.cpp +++ b/clang/test/Analysis/loop-unrolling.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config unroll-loops=true,cfg-loopexit=true -verify -std=c++14 -analyzer-config exploration_strategy=unexplored_first_queue %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config unroll-loops=true,cfg-loopexit=true,exploration_strategy=dfs -verify -std=c++14 -DDFS=1 %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config unroll-loops=true,cfg-loopexit=true -verify=expected,default -std=c++14 -analyzer-config exploration_strategy=unexplored_first_queue %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config unroll-loops=true,cfg-loopexit=true,exploration_strategy=dfs -verify=expected,dfs -std=c++14 %s void clang_analyzer_numTimesReached(); void clang_analyzer_warnIfReached(); @@ -337,6 +337,7 @@ int nested_both_unrolled() { } int simple_known_bound_loop() { + // Iteration count visible: can be unrolled and fully executed. for (int i = 2; i < 12; i++) { // This function is inlined in nested_inlined_unroll1() clang_analyzer_numTimesReached(); // expected-warning {{90}} @@ -345,27 +346,42 @@ int simple_known_bound_loop() { } int simple_unknown_bound_loop() { + // Iteration count unknown: unrolling won't happen and the execution will be + // split two times: + // (1) split between skipped loop (immediate exit) and entering the loop + // (2) split between exit after 1 iteration and entering the second iteration + // After these there is no third state split because the "don't assume third + // iteration" logic in `ExprEngine::processBranch` prevents it; but the + // `legacy-inlining-prevention` logic will put this function onto the list of + // functions that may not be inlined in the future. + // The exploration strategy apparently influences the number of times this + // function can be inlined before it's placed on the "don't inline" list. for (int i = 2; i < getNum(); i++) { - clang_analyzer_numTimesReached(); // expected-warning {{8}} + clang_analyzer_numTimesReached(); // default-warning {{4}} dfs-warning {{8}} } return 0; } int nested_inlined_unroll1() { + // Here the analyzer can unroll and fully execute both the outer loop and the + // inner loop within simple_known_bound_loop(). int k; for (int i = 0; i < 9; i++) { clang_analyzer_numTimesReached(); // expected-warning {{9}} - k = simple_known_bound_loop(); // no reevaluation without inlining + k = simple_known_bound_loop(); } int a = 22 / k; // expected-warning {{Division by zero}} return 0; } int nested_inlined_no_unroll1() { + // Here no unrolling happens and we only run `analyzer-max-loop` (= 4) + // iterations of the loop within this function, but some state splits happen + // in `simple_unknown_bound_loop()` calls. 
int k; - for (int i = 0; i < 9; i++) { - clang_analyzer_numTimesReached(); // expected-warning {{10}} - k = simple_unknown_bound_loop(); // reevaluation without inlining, splits the state as well + for (int i = 0; i < 40; i++) { + clang_analyzer_numTimesReached(); // default-warning {{9}} dfs-warning {{12}} + k = simple_unknown_bound_loop(); } int a = 22 / k; // no-warning return 0; diff --git a/clang/test/CodeCompletion/GH139019.cpp b/clang/test/CodeCompletion/GH139019.cpp new file mode 100644 index 0000000000000..fed35b38362a1 --- /dev/null +++ b/clang/test/CodeCompletion/GH139019.cpp @@ -0,0 +1,26 @@ +// RUN: rm -rf %t +// RUN: mkdir %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/test.hpp -emit-pch -o %t/1.pch +// RUN: %clang_cc1 -std=c++20 %t/test.cpp -include-pch %t/1.pch -code-completion-at=%t/test.cpp:7:17 + +//--- test.hpp +#pragma once +class provider_t +{ + public: + template + void emit(T *data) + {} +}; + +//--- test.cpp +#include "test.hpp" + +void test() +{ + provider_t *focus; + void *data; + focus->emit(&data); +} diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-bitop.c b/clang/test/CodeGen/SystemZ/builtins-systemz-bitop.c index 5b4051c8d6f17..717a7d7ab49e2 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-bitop.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-bitop.c @@ -1,6 +1,6 @@ // REQUIRES: systemz-registered-target -// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-ibm-linux -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-ibm-linux -Wall -Wno-unused -Werror -emit-llvm -x c++ %s -o - | FileCheck %s +// RUN: %clang_cc1 -target-cpu z17 -triple s390x-ibm-linux -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -target-cpu z17 -triple s390x-ibm-linux -Wall -Wno-unused -Werror -emit-llvm -x c++ %s -o - | FileCheck %s unsigned long test_bdepg(unsigned long a, unsigned long b) { // CHECK-LABEL: test_bdepg diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-vector5-error.c b/clang/test/CodeGen/SystemZ/builtins-systemz-vector5-error.c index 3943a15af9d2f..8275b9ddb88a8 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-vector5-error.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-vector5-error.c @@ -1,5 +1,5 @@ // REQUIRES: systemz-registered-target -// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-unknown-unknown \ +// RUN: %clang_cc1 -target-cpu z17 -triple s390x-unknown-unknown \ // RUN: -Wall -Wno-unused -Werror -fsyntax-only -verify %s typedef __attribute__((vector_size(16))) signed char vec_schar; diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-vector5.c b/clang/test/CodeGen/SystemZ/builtins-systemz-vector5.c index c3621819e71f9..b765fa64b33d4 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-vector5.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-vector5.c @@ -1,5 +1,5 @@ // REQUIRES: systemz-registered-target -// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-ibm-linux -flax-vector-conversions=none \ +// RUN: %clang_cc1 -target-cpu z17 -triple s390x-ibm-linux -flax-vector-conversions=none \ // RUN: -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s typedef __attribute__((vector_size(16))) signed char vec_schar; diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5-error.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5-error.c index 9f4844efd6312..79041b923068e 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5-error.c +++ 
b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5-error.c @@ -1,5 +1,5 @@ // REQUIRES: systemz-registered-target -// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-linux-gnu \ +// RUN: %clang_cc1 -target-cpu z17 -triple s390x-linux-gnu \ // RUN: -fzvector -flax-vector-conversions=none \ // RUN: -Wall -Wno-unused -Werror -fsyntax-only -verify %s diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c index 7a29dbf552e0b..6ee9e1ee3a117 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c @@ -1,8 +1,8 @@ // REQUIRES: systemz-registered-target -// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-linux-gnu \ +// RUN: %clang_cc1 -target-cpu z17 -triple s390x-linux-gnu \ // RUN: -O2 -fzvector -flax-vector-conversions=none \ // RUN: -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-linux-gnu \ +// RUN: %clang_cc1 -target-cpu z17 -triple s390x-linux-gnu \ // RUN: -O2 -fzvector -flax-vector-conversions=none \ // RUN: -Wall -Wno-unused -Werror -S %s -o - | FileCheck %s --check-prefix=CHECK-ASM diff --git a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c index 1e1926678ec33..e5704709a3a33 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c @@ -18,6 +18,8 @@ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu z17 \ +// RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch15 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c index 58081bdc6cc2a..7de425950e9fd 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi.c @@ -24,6 +24,8 @@ // RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \ // RUN: -emit-llvm -o - %s -mfloat-abi soft | FileCheck %s \ // RUN: --check-prefixes=CHECK,SOFT-FLOAT +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu z17 \ +// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT // RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch15 \ // RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT // RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch15 \ diff --git a/clang/test/CodeGenCXX/mangle-ms-matrix.cpp b/clang/test/CodeGenCXX/mangle-ms-matrix.cpp new file mode 100644 index 0000000000000..b244aa6e33cfa --- /dev/null +++ b/clang/test/CodeGenCXX/mangle-ms-matrix.cpp @@ -0,0 +1,57 @@ +// RUN: %clang_cc1 -fenable-matrix -fms-extensions -fcxx-exceptions -ffreestanding -target-feature +avx -emit-llvm %s -o - -triple=i686-pc-win32 | FileCheck %s +// RUN: %clang_cc1 -fenable-matrix -fms-extensions -fcxx-exceptions -ffreestanding -target-feature +avx -emit-llvm %s -o - -triple=i686-pc-win32 -fexperimental-new-constant-interpreter | FileCheck %s + +typedef float 
__attribute__((matrix_type(4, 4))) m4x4f; +typedef float __attribute__((matrix_type(2, 2))) m2x2f; + +typedef int __attribute__((matrix_type(4, 4))) m4x4i; +typedef int __attribute__((matrix_type(2, 2))) m2x2i; + +void thow(int i) { + switch (i) { + case 0: throw m4x4f(); + // CHECK: ??_R0U?$__matrix@M$03$03@__clang@@@8 + // CHECK: _CT??_R0U?$__matrix@M$03$03@__clang@@@864 + // CHECK: _CTA1U?$__matrix@M$03$03@__clang@@ + // CHECK: _TI1U?$__matrix@M$03$03@__clang@@ + case 1: throw m2x2f(); + // CHECK: ??_R0U?$__matrix@M$01$01@__clang@@@8 + // CHECK: _CT??_R0U?$__matrix@M$01$01@__clang@@@816 + // CHECK: _CTA1U?$__matrix@M$01$01@__clang@@ + // CHECK: _TI1U?$__matrix@M$01$01@__clang@@ + case 2: throw m4x4i(); + // CHECK: ??_R0U?$__matrix@H$03$03@__clang@@@8 + // CHECK: _CT??_R0U?$__matrix@H$03$03@__clang@@@864 + // CHECK: _CTA1U?$__matrix@H$03$03@__clang@@ + // CHECK: _TI1U?$__matrix@H$03$03@__clang@@ + case 3: throw m2x2i(); + // CHECK: ??_R0U?$__matrix@H$01$01@__clang@@@8 + // CHECK: _CT??_R0U?$__matrix@H$01$01@__clang@@@816 + // CHECK: _CTA1U?$__matrix@H$01$01@__clang@@ + // CHECK: _TI1U?$__matrix@H$01$01@__clang@@ + } +} + +void foo44f(m4x4f) {} +// CHECK: define dso_local void @"?foo44f@@YAXU?$__matrix@M$03$03@__clang@@@Z" + +m4x4f rfoo44f() { return m4x4f(); } +// CHECK: define dso_local noundef <16 x float> @"?rfoo44f@@YAU?$__matrix@M$03$03@__clang@@XZ" + +void foo22f(m2x2f) {} +// CHECK: define dso_local void @"?foo22f@@YAXU?$__matrix@M$01$01@__clang@@@Z" + +m2x2f rfoo22f() { return m2x2f(); } +// CHECK: define dso_local noundef <4 x float> @"?rfoo22f@@YAU?$__matrix@M$01$01@__clang@@XZ" + +void foo44i(m4x4i) {} +// CHECK: define dso_local void @"?foo44i@@YAXU?$__matrix@H$03$03@__clang@@@Z" + +m4x4i rfoo44i() { return m4x4i(); } +// CHECK: define dso_local noundef <16 x i32> @"?rfoo44i@@YAU?$__matrix@H$03$03@__clang@@XZ" + +void foo22i(m2x2i) {} +// CHECK: define dso_local void @"?foo22i@@YAXU?$__matrix@H$01$01@__clang@@@Z" + +m2x2i rfoo22i() { return m2x2i(); } +// CHECK: define dso_local noundef <4 x i32> @"?rfoo22i@@YAU?$__matrix@H$01$01@__clang@@XZ" \ No newline at end of file diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c index a80d0f5c79ec1..29e9682d58700 100644 --- a/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c +++ b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c @@ -28,6 +28,8 @@ // CHECK-NEXT: FEAT_FP16 Enable half-precision floating-point data processing // CHECK-NEXT: FEAT_FP8 Enable FP8 instructions // CHECK-NEXT: FEAT_FP8DOT2 Enable FP8 2-way dot instructions +// CHECK-NEXT: FEAT_FP8DOT4 Enable FP8 4-way dot instructions +// CHECK-NEXT: FEAT_FP8FMA Enable Armv9.5-A FP8 multiply-add instructions // CHECK-NEXT: FEAT_FPAC Enable Armv8.3-A Pointer Authentication Faulting enhancement // CHECK-NEXT: FEAT_FRINTTS Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int // CHECK-NEXT: FEAT_FlagM Enable Armv8.4-A Flag Manipulation instructions diff --git a/clang/test/Driver/systemz-march.c b/clang/test/Driver/systemz-march.c index 93a11c6c9c013..8922db9f2d5d6 100644 --- a/clang/test/Driver/systemz-march.c +++ b/clang/test/Driver/systemz-march.c @@ -15,6 +15,7 @@ // RUN: %clang -target s390x -### -S -emit-llvm -march=arch13 %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH13 %s // RUN: %clang -target s390x -### -S -emit-llvm -march=z16 %s 2>&1 | FileCheck 
--check-prefix=CHECK-Z16 %s // RUN: %clang -target s390x -### -S -emit-llvm -march=arch14 %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH14 %s +// RUN: %clang -target s390x -### -S -emit-llvm -march=z17 %s 2>&1 | FileCheck --check-prefix=CHECK-Z17 %s // RUN: %clang -target s390x -### -S -emit-llvm -march=arch15 %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH15 %s // CHECK-Z9: error: unknown target CPU 'z9' @@ -32,6 +33,7 @@ // CHECK-ARCH13: "-target-cpu" "arch13" // CHECK-Z16: "-target-cpu" "z16" // CHECK-ARCH14: "-target-cpu" "arch14" +// CHECK-Z17: "-target-cpu" "z17" // CHECK-ARCH15: "-target-cpu" "arch15" int x; diff --git a/clang/test/Interpreter/lambda.cpp b/clang/test/Interpreter/lambda.cpp index df75274a050b2..fee6c73bf95cb 100644 --- a/clang/test/Interpreter/lambda.cpp +++ b/clang/test/Interpreter/lambda.cpp @@ -1,7 +1,11 @@ // REQUIRES: host-supports-jit // UNSUPPORTED: system-aix +// At -O2, somehow "x = 42" appears first when piped into FileCheck, +// see https://github.com/llvm/llvm-project/issues/143547. +// UNSUPPORTED: system-windows // RUN: cat %s | clang-repl | FileCheck %s -// RUN: cat %s | clang-repl -Xcc -O2 | FileCheck %s +// RUN: cat %s | clang-repl -Xcc -Xclang -Xcc -verify -Xcc -O2 | FileCheck %s + extern "C" int printf(const char *, ...); auto l1 = []() { printf("ONE\n"); return 42; }; @@ -14,4 +18,14 @@ auto r2 = l2(); auto r3 = l2(); // CHECK: TWO -%quit +// Verify non-local lambda capture error is correctly reported +int x = 42; + +// expected-error {{non-local lambda expression cannot have a capture-default}} +auto capture = [&]() { return x * 2; }; + +// Ensure interpreter continues and x is still valid +printf("x = %d\n", x); +// CHECK: x = 42 + +%quit \ No newline at end of file diff --git a/clang/test/Misc/target-invalid-cpu-note/systemz.c b/clang/test/Misc/target-invalid-cpu-note/systemz.c index b70173f5feec2..021c280d53190 100644 --- a/clang/test/Misc/target-invalid-cpu-note/systemz.c +++ b/clang/test/Misc/target-invalid-cpu-note/systemz.c @@ -20,4 +20,5 @@ // CHECK-SAME: {{^}}, arch14 // CHECK-SAME: {{^}}, z16 // CHECK-SAME: {{^}}, arch15 +// CHECK-SAME: {{^}}, z17 // CHECK-SAME: {{$}} diff --git a/clang/test/Parser/recovery.cpp b/clang/test/Parser/recovery.cpp index 2fce67a52c6b6..261f5dc99bad4 100644 --- a/clang/test/Parser/recovery.cpp +++ b/clang/test/Parser/recovery.cpp @@ -222,3 +222,21 @@ void k() { func(1, ); // expected-error {{expected expression}} } } + +namespace GH136254 { + +void call() { + [a(42, )]() {} (); // expected-error {{expected expression}} + + int *b = new int(42, ); // expected-error {{expected expression}} + + struct S { + int c; + + S() : c(42, ) {} // expected-error {{expected expression}} + }; + + int d(42, ); // expected-error {{expected expression}} +} + +} diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c index ac461b371162f..71a266b8a9157 100644 --- a/clang/test/Preprocessor/init-loongarch.c +++ b/clang/test/Preprocessor/init-loongarch.c @@ -946,6 +946,10 @@ // RUN: | FileCheck --match-full-lines --check-prefix=MNO-LSX %s // RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -x c -E -dM %s -o - \ // RUN: | FileCheck --match-full-lines --check-prefix=MNO-LSX %s +// RUN: %clang --target=loongarch64 -march=la464 -mno-lsx -x c -E -dM %s -o - \ +// RUN: | FileCheck --match-full-lines --check-prefix=MNO-LSX %s +// RUN: %clang --target=loongarch64 -mno-lsx -march=la464 -x c -E -dM %s -o - \ +// RUN: | FileCheck --match-full-lines --check-prefix=MNO-LSX %s // MNO-LSX-NOT: #define 
__loongarch_asx // MNO-LSX-NOT: #define __loongarch_simd_width // MNO-LSX-NOT: #define __loongarch_sx diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index f267f1759cdb5..2d17891071aae 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -4394,6 +4394,9 @@ // RUN: %clang -march=arch15 -E -dM %s -o - 2>&1 \ // RUN: -target s390x-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ARCH15 +// RUN: %clang -march=z17 -E -dM %s -o - 2>&1 \ +// RUN: -target s390x-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ARCH15 // CHECK_SYSTEMZ_ARCH15: #define __ARCH__ 15 // CHECK_SYSTEMZ_ARCH15: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 // CHECK_SYSTEMZ_ARCH15: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 diff --git a/clang/test/SemaCXX/builtin-object-size-cxx14.cpp b/clang/test/SemaCXX/builtin-object-size-cxx14.cpp index b7c6f6be01f54..fdd3cb7af088f 100644 --- a/clang/test/SemaCXX/builtin-object-size-cxx14.cpp +++ b/clang/test/SemaCXX/builtin-object-size-cxx14.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx14 -std=c++14 %s // RUN: %clang_cc1 -fsyntax-only -verify -std=c++2a %s +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++2b %s + typedef __SIZE_TYPE__ size_t; @@ -119,3 +121,13 @@ constexpr int bos_new() { // cxx14-error {{constant expression}} void *p = new int; // cxx14-note {{until C++20}} return __builtin_object_size(p, 0); } + + +namespace GH129397 { + +struct incomplete; +void test(incomplete &ref) { + __builtin_object_size(&ref, 1); +} + +} diff --git a/clang/test/SemaCXX/constant-expression-p2280r4.cpp b/clang/test/SemaCXX/constant-expression-p2280r4.cpp index 6c9a87267109c..dbaebb81b93e8 100644 --- a/clang/test/SemaCXX/constant-expression-p2280r4.cpp +++ b/clang/test/SemaCXX/constant-expression-p2280r4.cpp @@ -179,3 +179,36 @@ namespace extern_reference_used_as_unknown { int y; constinit int& g = (x,y); // expected-warning {{left operand of comma operator has no effect}} } + +namespace GH139452 { +struct Dummy { + explicit operator bool() const noexcept { return true; } +}; + +struct Base { int error; }; +struct Derived : virtual Base { }; + +template +constexpr R get_value() { + const auto& derived_val = Derived{}; + if (derived_val.error != 0) + /* nothing */; + return R{}; +} + +int f() { + return !get_value(); // contextually convert the function call result to bool +} +} + +namespace param_reference { + constexpr int arbitrary = -12345; + constexpr void f(const int &x = arbitrary) { // expected-note {{declared here}} + constexpr const int &v1 = x; // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{reference to 'x' is not a constant expression}} + constexpr const int &v2 = (x, arbitrary); // expected-warning {{left operand of comma operator has no effect}} + constexpr int v3 = x; // expected-error {{must be initialized by a constant expression}} + static_assert(x==arbitrary); // expected-error {{static assertion expression is not an integral constant expression}} + static_assert(&x - &x == 0); + } +} diff --git a/clang/test/SemaCXX/cxx2c-enum-compare.cpp b/clang/test/SemaCXX/cxx2c-enum-compare.cpp index f47278a60725e..96fbd368b1696 100644 --- a/clang/test/SemaCXX/cxx2c-enum-compare.cpp +++ b/clang/test/SemaCXX/cxx2c-enum-compare.cpp @@ -1,9 +1,10 @@ -// RUN: %clang_cc1 %s -std=c++2c -fsyntax-only -verify -triple 
%itanium_abi_triple +// RUN: %clang_cc1 %s -std=c++2c -fsyntax-only -verify=both,expected +// RUN: %clang_cc1 %s -std=c++2c -fsyntax-only -verify=both -Wno-enum-enum-conversion enum E1 { e }; enum E2 { f }; void test() { - int b = e <= 3.7; // expected-error {{invalid comparison of enumeration type 'E1' with floating-point type 'double'}} + int b = e <= 3.7; // both-error {{invalid comparison of enumeration type 'E1' with floating-point type 'double'}} int k = f - e; // expected-error {{invalid arithmetic between different enumeration types ('E2' and 'E1')}} int x = 1 ? e : f; // expected-error {{invalid conditional expression between different enumeration types ('E1' and 'E2')}} } diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index 28610052b9b74..96eec7c666a38 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -492,8 +492,8 @@ static bool format(StringRef FileName, bool ErrorOnIncompleteFormat = false) { // To format JSON insert a variable to trick the code into thinking its // JavaScript. if (IsJson && !FormatStyle->DisableFormat) { - auto Err = Replaces.add(tooling::Replacement( - tooling::Replacement(AssumedFileName, 0, 0, "x = "))); + auto Err = + Replaces.add(tooling::Replacement(AssumedFileName, 0, 0, "x = ")); if (Err) llvm::errs() << "Bad Json variable insertion\n"; } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index bf3eff129efd5..1afcc75a2e19e 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -12419,6 +12419,13 @@ TEST_F(FormatTest, UnderstandsAttributes) { verifyFormat("SomeType s __unused{InitValue};", CustomAttrs); verifyFormat("SomeType *__capability s(InitValue);", CustomAttrs); verifyFormat("SomeType *__capability s{InitValue};", CustomAttrs); + + auto Style = getLLVMStyleWithColumns(60); + Style.AttributeMacros.push_back("my_fancy_attr"); + Style.PointerAlignment = FormatStyle::PAS_Left; + verifyFormat("void foo(const MyLongTypeNameeeeeeeeeeeee* my_fancy_attr\n" + " testttttttttt);", + Style); } TEST_F(FormatTest, UnderstandsPointerQualifiersInCast) { @@ -13962,6 +13969,8 @@ TEST_F(FormatTest, IncorrectCodeUnbalancedBraces) { verifyNoCrash("struct Foo {\n" " operator foo(bar\n" "};"); + verifyNoCrash("decltype( {\n" + " {"); } TEST_F(FormatTest, IncorrectUnbalancedBracesInMacrosWithUnicode) { @@ -27895,6 +27904,8 @@ TEST_F(FormatTest, RemoveParentheses) { verifyFormat("return ((... && std::is_convertible_v));", "return (((... && std::is_convertible_v)));", Style); + verifyFormat("MOCK_METHOD(void, Function, (), override);", + "MOCK_METHOD(void, Function, (), (override));", Style); Style.RemoveParentheses = FormatStyle::RPS_ReturnStatement; verifyFormat("#define Return0 return (0);", Style); diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp index 33998bc7ff858..d0a3b4eb96d69 100644 --- a/clang/unittests/Format/FormatTestJava.cpp +++ b/clang/unittests/Format/FormatTestJava.cpp @@ -789,6 +789,63 @@ TEST_F(FormatTestJava, AlignCaseArrows) { Style); } +TEST_F(FormatTestJava, TextBlock) { + verifyNoChange("String myStr = \"\"\"\n" + "hello\n" + "there\n" + "\"\"\";"); + + verifyNoChange("String tb = \"\"\"\n" + " the new\"\"\";"); + + verifyNoChange("System.out.println(\"\"\"\n" + " This is the first line\n" + " This is the second line\n" + " \"\"\");"); + + verifyNoChange("void writeHTML() {\n" + " String html = \"\"\" \n" + " \n" + "

<html>\n" + " <body>\n" + " <p>Hello World.</p>\n" + " </body>\n" + " </html>
\n" + " \n" + "\"\"\";\n" + " writeOutput(html);\n" + "}"); + + verifyNoChange("String colors = \"\"\"\t\n" + " red\n" + " green\n" + " blue\"\"\".indent(4);"); + + verifyNoChange("String code = \"\"\"\n" + " String source = \\\"\"\"\n" + " String message = \"Hello, World!\";\n" + " System.out.println(message);\n" + " \\\"\"\";\n" + " \"\"\";"); + + verifyNoChange( + "class Outer {\n" + " void printPoetry() {\n" + " String lilacs = \"\"\"\n" + "Passing the apple-tree blows of white and pink in the orchards\n" + "\"\"\";\n" + " System.out.println(lilacs);\n" + " }\n" + "}"); + + verifyNoChange("String name = \"\"\"\r\n" + " red\n" + " green\n" + " blue\\\n" + " \"\"\";"); + + verifyFormat("String name = \"\"\"Pat Q. Smith\"\"\";"); + + verifyNoChange("String name = \"\"\"\n" + " Pat Q. Smith"); +} + } // namespace } // namespace test } // namespace format diff --git a/clang/unittests/Format/FormatTestRawStrings.cpp b/clang/unittests/Format/FormatTestRawStrings.cpp index 0615fb1fad4c5..3f09c7b6086e5 100644 --- a/clang/unittests/Format/FormatTestRawStrings.cpp +++ b/clang/unittests/Format/FormatTestRawStrings.cpp @@ -988,6 +988,28 @@ fffffffffffffffffffff("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", )pb");)test", Style)); } + +TEST_F(FormatTestRawStrings, Json) { + auto Style = getLLVMStyle(); + Style.RawStringFormats = { + { + /*Language=*/FormatStyle::LK_Json, + /*Delimiters=*/{"json"}, + /*EnclosingFunctions=*/{}, + /*CanonicalDelimiter=*/"", + /*BasedOnStyle=*/"llvm", + }, + }; + + EXPECT_EQ("json = R\"json({\n" + " \"str\": \"test\"\n" + " })json\";", + format("json = R\"json({\n" + " \"str\": \"test\"\n" + "})json\";", + Style)); +} + } // end namespace } // end namespace format } // end namespace clang diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 757db66c3e298..602c2d5eba29a 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -2187,6 +2187,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsFunctionDeclarationNames) { EXPECT_TOKEN(Tokens[1], tok::identifier, TT_FunctionDeclarationName); EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_FunctionDeclarationLParen); + Tokens = annotate("#define FUNC(foo, bar, baz) \\\n" + " auto foo##bar##baz() -> Type {}"); + ASSERT_EQ(Tokens.size(), 23u) << Tokens; + EXPECT_TOKEN(Tokens[11], tok::identifier, TT_FunctionDeclarationName); + EXPECT_TOKEN(Tokens[16], tok::l_paren, TT_FunctionDeclarationLParen); + EXPECT_TOKEN(Tokens[18], tok::arrow, TT_TrailingReturnArrow); + Tokens = annotate("int iso_time(time_t);"); ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::identifier, TT_FunctionDeclarationName); diff --git a/clang/unittests/Tooling/QualTypeNamesTest.cpp b/clang/unittests/Tooling/QualTypeNamesTest.cpp index 5ded64d4fcc8c..49c40d633ad4b 100644 --- a/clang/unittests/Tooling/QualTypeNamesTest.cpp +++ b/clang/unittests/Tooling/QualTypeNamesTest.cpp @@ -265,6 +265,102 @@ TEST(QualTypeNameTest, InlineNamespace) { TypeNameVisitor::Lang_CXX11); } +TEST(QualTypeNameTest, TemplatedClass) { + std::unique_ptr AST = + tooling::buildASTFromCode("template struct A {\n" + " template struct B {};\n" + "};\n" + "template struct A<1>;\n" + "template struct A<2u>;\n" + "template struct A<1>::B<3>;\n" + "template struct A<2u>::B<4u>;\n"); + + auto &Context = AST->getASTContext(); + auto &Policy = Context.getPrintingPolicy(); + auto getFullyQualifiedName = [&](QualType QT) { + return TypeName::getFullyQualifiedName(QT, Context, Policy); + }; + 
+ auto *A = Context.getTranslationUnitDecl() + ->lookup(&Context.Idents.get("A")) + .find_first(); + ASSERT_NE(A, nullptr); + + // A has two explicit instantiations: A<1> and A<2u> + auto ASpec = A->spec_begin(); + ASSERT_NE(ASpec, A->spec_end()); + auto *A1 = *ASpec; + ASpec++; + ASSERT_NE(ASpec, A->spec_end()); + auto *A2 = *ASpec; + + // Their type names follow the records. + QualType A1RecordTy = Context.getRecordType(A1); + EXPECT_EQ(getFullyQualifiedName(A1RecordTy), "A<1>"); + QualType A2RecordTy = Context.getRecordType(A2); + EXPECT_EQ(getFullyQualifiedName(A2RecordTy), "A<2U>"); + + // getTemplateSpecializationType() gives types that print the integral + // argument directly. + TemplateArgument Args1[] = { + {Context, llvm::APSInt::getUnsigned(1u), Context.UnsignedIntTy}}; + QualType A1TemplateSpecTy = + Context.getTemplateSpecializationType(TemplateName(A), Args1, A1RecordTy); + EXPECT_EQ(A1TemplateSpecTy.getAsString(), "A<1>"); + + TemplateArgument Args2[] = { + {Context, llvm::APSInt::getUnsigned(2u), Context.UnsignedIntTy}}; + QualType A2TemplateSpecTy = + Context.getTemplateSpecializationType(TemplateName(A), Args2, A2RecordTy); + EXPECT_EQ(A2TemplateSpecTy.getAsString(), "A<2>"); + + // Find A<1>::B and its specialization B<3>. + auto *A1B = + A1->lookup(&Context.Idents.get("B")).find_first(); + ASSERT_NE(A1B, nullptr); + auto A1BSpec = A1B->spec_begin(); + ASSERT_NE(A1BSpec, A1B->spec_end()); + auto *A1B3 = *A1BSpec; + QualType A1B3RecordTy = Context.getRecordType(A1B3); + EXPECT_EQ(getFullyQualifiedName(A1B3RecordTy), "A<1>::B<3>"); + + // Construct A<1>::B<3> and check name. + TemplateArgument Args3[] = { + {Context, llvm::APSInt::getUnsigned(3u), Context.UnsignedIntTy}}; + QualType A1B3TemplateSpecTy = Context.getTemplateSpecializationType( + TemplateName(A1B), Args3, A1B3RecordTy); + EXPECT_EQ(A1B3TemplateSpecTy.getAsString(), "B<3>"); + + NestedNameSpecifier *A1Nested = NestedNameSpecifier::Create( + Context, nullptr, false, A1TemplateSpecTy.getTypePtr()); + QualType A1B3ElaboratedTy = Context.getElaboratedType( + ElaboratedTypeKeyword::None, A1Nested, A1B3TemplateSpecTy); + EXPECT_EQ(A1B3ElaboratedTy.getAsString(), "A<1>::B<3>"); + + // Find A<2u>::B and its specialization B<4u>. + auto *A2B = + A2->lookup(&Context.Idents.get("B")).find_first(); + ASSERT_NE(A2B, nullptr); + auto A2BSpec = A2B->spec_begin(); + ASSERT_NE(A2BSpec, A2B->spec_end()); + auto *A2B4 = *A2BSpec; + QualType A2B4RecordTy = Context.getRecordType(A2B4); + EXPECT_EQ(getFullyQualifiedName(A2B4RecordTy), "A<2U>::B<4U>"); + + // Construct A<2>::B<4> and check name. 
+ TemplateArgument Args4[] = { + {Context, llvm::APSInt::getUnsigned(4u), Context.UnsignedIntTy}}; + QualType A2B4TemplateSpecTy = Context.getTemplateSpecializationType( + TemplateName(A2B), Args4, A2B4RecordTy); + EXPECT_EQ(A2B4TemplateSpecTy.getAsString(), "B<4>"); + + NestedNameSpecifier *A2Nested = NestedNameSpecifier::Create( + Context, nullptr, false, A2TemplateSpecTy.getTypePtr()); + QualType A2B4ElaboratedTy = Context.getElaboratedType( + ElaboratedTypeKeyword::None, A2Nested, A2B4TemplateSpecTy); + EXPECT_EQ(A2B4ElaboratedTy.getAsString(), "A<2>::B<4>"); +} + TEST(QualTypeNameTest, AnonStrucs) { TypeNameVisitor AnonStrucs; AnonStrucs.ExpectedQualTypeNames["a"] = "short"; diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake index 7bb6c66a92e12..34900b999a4ae 100644 --- a/cmake/Modules/LLVMVersion.cmake +++ b/cmake/Modules/LLVMVersion.cmake @@ -7,7 +7,7 @@ if(NOT DEFINED LLVM_VERSION_MINOR) set(LLVM_VERSION_MINOR 1) endif() if(NOT DEFINED LLVM_VERSION_PATCH) - set(LLVM_VERSION_PATCH 4) + set(LLVM_VERSION_PATCH 8) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) set(LLVM_VERSION_SUFFIX) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 6816119065263..040f501ee52e9 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -21,18 +21,6 @@ #include "rtsan/rtsan.h" #if SANITIZER_APPLE - -#if TARGET_OS_MAC -// On MacOS OSSpinLockLock is deprecated and no longer present in the headers, -// but the symbol still exists on the system. Forward declare here so we -// don't get compilation errors. -#include -extern "C" { -typedef int32_t OSSpinLock; -void OSSpinLockLock(volatile OSSpinLock *__lock); -} -#endif // TARGET_OS_MAC - #include #include #endif // SANITIZER_APPLE @@ -627,21 +615,35 @@ INTERCEPTOR(mode_t, umask, mode_t cmask) { #pragma clang diagnostic push // OSSpinLockLock is deprecated, but still in use in libc++ #pragma clang diagnostic ignored "-Wdeprecated-declarations" +#undef OSSpinLockLock + INTERCEPTOR(void, OSSpinLockLock, volatile OSSpinLock *lock) { __rtsan_notify_intercepted_call("OSSpinLockLock"); return REAL(OSSpinLockLock)(lock); } -#pragma clang diagnostic pop + #define RTSAN_MAYBE_INTERCEPT_OSSPINLOCKLOCK INTERCEPT_FUNCTION(OSSpinLockLock) #else #define RTSAN_MAYBE_INTERCEPT_OSSPINLOCKLOCK #endif // SANITIZER_APPLE +#if SANITIZER_APPLE +// _os_nospin_lock_lock may replace OSSpinLockLock due to deprecation macro. 
+typedef volatile OSSpinLock *_os_nospin_lock_t; + +INTERCEPTOR(void, _os_nospin_lock_lock, _os_nospin_lock_t lock) { + __rtsan_notify_intercepted_call("_os_nospin_lock_lock"); + return REAL(_os_nospin_lock_lock)(lock); +} +#pragma clang diagnostic pop // "-Wdeprecated-declarations" +#endif // SANITIZER_APPLE + #if SANITIZER_APPLE INTERCEPTOR(void, os_unfair_lock_lock, os_unfair_lock_t lock) { __rtsan_notify_intercepted_call("os_unfair_lock_lock"); return REAL(os_unfair_lock_lock)(lock); } + #define RTSAN_MAYBE_INTERCEPT_OS_UNFAIR_LOCK_LOCK \ INTERCEPT_FUNCTION(os_unfair_lock_lock) #else diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 59663776366bb..7eda884951c83 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -1036,10 +1036,18 @@ TEST(TestRtsanInterceptors, PthreadJoinDiesWhenRealtime) { } #if SANITIZER_APPLE - #pragma clang diagnostic push // OSSpinLockLock is deprecated, but still in use in libc++ #pragma clang diagnostic ignored "-Wdeprecated-declarations" +#undef OSSpinLockLock +extern "C" { +typedef int32_t OSSpinLock; +void OSSpinLockLock(volatile OSSpinLock *__lock); +// _os_nospin_lock_lock may replace OSSpinLockLock due to deprecation macro. +typedef volatile OSSpinLock *_os_nospin_lock_t; +void _os_nospin_lock_lock(_os_nospin_lock_t lock); +} + TEST(TestRtsanInterceptors, OsSpinLockLockDiesWhenRealtime) { auto Func = []() { OSSpinLock spin_lock{}; @@ -1048,7 +1056,14 @@ TEST(TestRtsanInterceptors, OsSpinLockLockDiesWhenRealtime) { ExpectRealtimeDeath(Func, "OSSpinLockLock"); ExpectNonRealtimeSurvival(Func); } -#pragma clang diagnostic pop + +TEST(TestRtsanInterceptors, OsNoSpinLockLockDiesWhenRealtime) { + OSSpinLock lock{}; + auto Func = [&]() { _os_nospin_lock_lock(&lock); }; + ExpectRealtimeDeath(Func, "_os_nospin_lock_lock"); + ExpectNonRealtimeSurvival(Func); +} +#pragma clang diagnostic pop //"-Wdeprecated-declarations" TEST(TestRtsanInterceptors, OsUnfairLockLockDiesWhenRealtime) { auto Func = []() { @@ -1058,7 +1073,7 @@ TEST(TestRtsanInterceptors, OsUnfairLockLockDiesWhenRealtime) { ExpectRealtimeDeath(Func, "os_unfair_lock_lock"); ExpectNonRealtimeSurvival(Func); } -#endif +#endif // SANITIZER_APPLE #if SANITIZER_LINUX TEST(TestRtsanInterceptors, SpinLockLockDiesWhenRealtime) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc index 49ec4097c900b..dda11daa77f49 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc @@ -338,17 +338,9 @@ static void ioctl_table_fill() { _(SOUND_PCM_WRITE_CHANNELS, WRITE, sizeof(int)); _(SOUND_PCM_WRITE_FILTER, WRITE, sizeof(int)); _(TCFLSH, NONE, 0); -#if SANITIZER_GLIBC - _(TCGETA, WRITE, struct_termio_sz); -#endif _(TCGETS, WRITE, struct_termios_sz); _(TCSBRK, NONE, 0); _(TCSBRKP, NONE, 0); -#if SANITIZER_GLIBC - _(TCSETA, READ, struct_termio_sz); - _(TCSETAF, READ, struct_termio_sz); - _(TCSETAW, READ, struct_termio_sz); -#endif _(TCSETS, READ, struct_termios_sz); _(TCSETSF, READ, struct_termios_sz); _(TCSETSW, READ, struct_termios_sz); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index 538f0570e131b..72256519132ff 100644 --- 
a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -86,6 +86,10 @@ # include # endif +# if SANITIZER_LINUX && defined(__powerpc64__) +# include +# endif + # if SANITIZER_FREEBSD # include # include diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index 4d7a15ced8183..57706f74fa7f5 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -619,21 +619,22 @@ static void GetTls(uptr *addr, uptr *size) { *addr = tp - RoundUpTo(*size, align); *size = tp - *addr + ThreadDescriptorSize(); # else - if (SANITIZER_GLIBC) - *size += 1664; - else if (SANITIZER_FREEBSD) - *size += 128; // RTLD_STATIC_TLS_EXTRA -# if defined(__mips__) || defined(__powerpc64__) || SANITIZER_RISCV64 +# if SANITIZER_GLIBC + *size += 1664; +# elif SANITIZER_FREEBSD + *size += 128; // RTLD_STATIC_TLS_EXTRA +# if defined(__mips__) || defined(__powerpc64__) || SANITIZER_RISCV64 const uptr pre_tcb_size = TlsPreTcbSize(); *addr -= pre_tcb_size; *size += pre_tcb_size; -# else +# else // arm and aarch64 reserve two words at TP, so this underestimates the range. // However, this is sufficient for the purpose of finding the pointers to // thread-specific data keys. const uptr tcb_size = ThreadDescriptorSize(); *addr -= tcb_size; *size += tcb_size; +# endif # endif # endif # elif SANITIZER_NETBSD diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp index 0fad072ac5cd1..d994d39913dd8 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp @@ -96,7 +96,7 @@ # include # if defined(__mips64) || defined(__aarch64__) || defined(__arm__) || \ defined(__hexagon__) || defined(__loongarch__) || SANITIZER_RISCV64 || \ - defined(__sparc__) + defined(__sparc__) || defined(__powerpc64__) # include # ifdef __arm__ typedef struct user_fpregs elf_fpregset_t; @@ -488,9 +488,6 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr); unsigned struct_input_id_sz = sizeof(struct input_id); unsigned struct_mtpos_sz = sizeof(struct mtpos); unsigned struct_rtentry_sz = sizeof(struct rtentry); -#if SANITIZER_GLIBC || SANITIZER_ANDROID - unsigned struct_termio_sz = sizeof(struct termio); -#endif unsigned struct_vt_consize_sz = sizeof(struct vt_consize); unsigned struct_vt_sizes_sz = sizeof(struct vt_sizes); unsigned struct_vt_stat_sz = sizeof(struct vt_stat); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h index aa05c4f577791..15a20cf3f5ad4 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h @@ -1059,7 +1059,6 @@ extern unsigned struct_hd_geometry_sz; extern unsigned struct_input_absinfo_sz; extern unsigned struct_input_id_sz; extern unsigned struct_mtpos_sz; -extern unsigned struct_termio_sz; extern unsigned struct_vt_consize_sz; extern unsigned struct_vt_sizes_sz; extern unsigned struct_vt_stat_sz; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp index 945da99d41f4e..58d17d90c343a 100644 --- 
a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp @@ -31,7 +31,8 @@ #include // for pid_t #include // for iovec #include // for NT_PRSTATUS -#if (defined(__aarch64__) || SANITIZER_RISCV64 || SANITIZER_LOONGARCH64) && \ +#if (defined(__aarch64__) || defined(__powerpc64__) || \ + SANITIZER_RISCV64 || SANITIZER_LOONGARCH64) && \ !SANITIZER_ANDROID // GLIBC 2.20+ sys/user does not include asm/ptrace.h # include diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 5c26469b9fa24..8da9252133bdc 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -359,7 +359,10 @@ void CheckHelper::Check(const Symbol &symbol) { // are not pertinent to the characteristics of the procedure. // Restrictions on entities in pure procedure interfaces don't need // enforcement. - } else if (!FindCommonBlockContaining(symbol) && IsSaved(symbol)) { + } else if (symbol.has() || + FindCommonBlockContaining(symbol)) { + // can look like they have SAVE but are fine in PURE + } else if (IsSaved(symbol)) { if (IsInitialized(symbol)) { messages_.Say( "A pure subprogram may not initialize a variable"_err_en_US); diff --git a/flang/test/Semantics/call10.f90 b/flang/test/Semantics/call10.f90 index 2d2f57934cd8a..1e186f7b4048a 100644 --- a/flang/test/Semantics/call10.f90 +++ b/flang/test/Semantics/call10.f90 @@ -36,6 +36,8 @@ pure subroutine s05a end subroutine end interface + real :: moduleVar = 1. + contains subroutine impure(x) @@ -117,6 +119,8 @@ pure subroutine s05 ! C1589 !ERROR: A pure subprogram may not initialize a variable real :: v6 = 0. end block + associate (x => moduleVar) ! ok + end associate end subroutine pure subroutine s06 ! C1589 !ERROR: A pure subprogram may not have a variable with the VOLATILE attribute diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES index 509236d587cd0..f97a1407f6631 100644 --- a/libclc/clc/lib/spirv/SOURCES +++ b/libclc/clc/lib/spirv/SOURCES @@ -10,5 +10,6 @@ ../generic/math/clc_nextafter.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl +../generic/relational/clc_isnan.cl ../generic/relational/clc_select.cl ../generic/shared/clc_clamp.cl diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES index 509236d587cd0..f97a1407f6631 100644 --- a/libclc/clc/lib/spirv64/SOURCES +++ b/libclc/clc/lib/spirv64/SOURCES @@ -10,5 +10,6 @@ ../generic/math/clc_nextafter.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl +../generic/relational/clc_isnan.cl ../generic/relational/clc_select.cl ../generic/shared/clc_clamp.cl diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index 06e6e673b5508..f81a573845e6f 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -153,9 +153,6 @@ Deprecations and Removals headers as an extension and only deprecates them. The ``_LIBCPP_DISABLE_DEPRECATION_WARNINGS`` macro can be defined to suppress deprecation for these headers. -- The ``_LIBCPP_DISABLE_AVAILABILITY`` macro that was used to force-disable availability markup has now been removed. - Whether availability markup is used by the library is now solely controlled at configuration-time. - - The pointer safety functions ``declare_reachable``, ``declare_no_pointers``, ``undeclare_no_pointers`` and ``__undeclare_reachable`` have been removed from the library. 
These functions were never implemented in a non-trivial way, making it very unlikely that any binary depends on them. diff --git a/libcxx/include/__configuration/availability.h b/libcxx/include/__configuration/availability.h index f9e52a690c05c..aa2e75b6f6fe8 100644 --- a/libcxx/include/__configuration/availability.h +++ b/libcxx/include/__configuration/availability.h @@ -69,7 +69,13 @@ // Availability markup is disabled when building the library, or when a non-Clang // compiler is used because only Clang supports the necessary attributes. -#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) || !defined(_LIBCPP_COMPILER_CLANG_BASED) +// +// We also allow users to force-disable availability markup via the `_LIBCPP_DISABLE_AVAILABILITY` +// macro because that is the only way to work around a Clang bug related to availability +// attributes: https://github.com/llvm/llvm-project/issues/134151. +// Once that bug has been fixed, we should remove the macro. +#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) || \ + !defined(_LIBCPP_COMPILER_CLANG_BASED) || defined(_LIBCPP_DISABLE_AVAILABILITY) # undef _LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS # define _LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS 0 #endif diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h index a0594ed9dc411..9cc39c0a1e067 100644 --- a/libcxx/include/__flat_map/flat_map.h +++ b/libcxx/include/__flat_map/flat_map.h @@ -113,7 +113,7 @@ class flat_map { class value_compare { private: - key_compare __comp_; + _LIBCPP_NO_UNIQUE_ADDRESS key_compare __comp_; _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : __comp_(__c) {} friend flat_map; diff --git a/libcxx/include/__flat_map/flat_multimap.h b/libcxx/include/__flat_map/flat_multimap.h index ea77fb5d79bd2..15fcd7995ad0a 100644 --- a/libcxx/include/__flat_map/flat_multimap.h +++ b/libcxx/include/__flat_map/flat_multimap.h @@ -115,7 +115,7 @@ class flat_multimap { class value_compare { private: - key_compare __comp_; + _LIBCPP_NO_UNIQUE_ADDRESS key_compare __comp_; _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : __comp_(__c) {} friend flat_multimap; diff --git a/libcxx/test/libcxx/vendor/apple/disable-availability.sh.cpp b/libcxx/test/libcxx/vendor/apple/disable-availability.sh.cpp new file mode 100644 index 0000000000000..474b3f83c6044 --- /dev/null +++ b/libcxx/test/libcxx/vendor/apple/disable-availability.sh.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: stdlib=apple-libc++ + +// This test is dependent on the code generated by the compiler, and it doesn't +// work properly with older AppleClangs. +// UNSUPPORTED: apple-clang-15 + +// This test ensures that we retain a way to disable availability markup on Apple platforms +// in order to work around Clang bug https://github.com/llvm/llvm-project/issues/134151. +// +// Once that bug has been fixed or once we've made changes to libc++'s use of availability +// that render that workaround unnecessary, the macro and this test can be removed. 
+// +// The test works by creating a final linked image that refers to a function marked with +// both an availability attribute and with _LIBCPP_HIDE_FROM_ABI. We then check that this +// generates a weak reference to the function -- without the bug, we'd expect a strong +// reference or no reference at all instead. + +// First, test the test. Make sure that we do (incorrectly) produce a weak definition when we +// don't define _LIBCPP_DISABLE_AVAILABILITY. Otherwise, something may have changed in libc++ +// and this test might not work anymore. +// RUN: %{cxx} %s %{flags} %{compile_flags} %{link_flags} -fvisibility=hidden -fvisibility-inlines-hidden -shared -o %t.1.dylib +// RUN: nm -m %t.1.dylib | c++filt | grep value > %t.1.symbols +// RUN: grep weak %t.1.symbols + +// Now, make sure that 'weak' goes away when we define _LIBCPP_DISABLE_AVAILABILITY. +// In fact, all references to the function might go away, so we just check that we don't emit +// any weak reference. +// RUN: %{cxx} %s %{flags} %{compile_flags} %{link_flags} -fvisibility=hidden -fvisibility-inlines-hidden -D_LIBCPP_DISABLE_AVAILABILITY -shared -o %t.2.dylib +// RUN: nm -m %t.2.dylib | c++filt | grep value > %t.2.symbols +// RUN: not grep weak %t.2.symbols + +#include + +template +struct optional { + T val_; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_INTRODUCED_IN_LLVM_11_ATTRIBUTE T value() const { return val_; } +}; + +using PMF = int (optional::*)() const; +PMF f() { return &optional::value; } diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp index bbb67d694970a..f02241ad36a5b 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp @@ -13,6 +13,8 @@ // REQUIRES: locale.fr_FR.UTF-8 +// ADDITIONAL_COMPILE_FLAGS: -DFR_MON_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_MON_THOUSANDS_SEP} + // // class money_get @@ -59,7 +61,8 @@ class my_facetw }; static std::wstring convert_thousands_sep(std::wstring const& in) { - return LocaleHelpers::convert_thousands_sep_fr_FR(in); + const wchar_t fr_sep = LocaleHelpers::mon_thousands_sep_or_default(FR_MON_THOU_SEP); + return LocaleHelpers::convert_thousands_sep(in, fr_sep); } #endif // TEST_HAS_NO_WIDE_CHARACTERS diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp index e680f2ea8816a..371cf0e90c8d3 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp @@ -11,6 +11,8 @@ // REQUIRES: locale.ru_RU.UTF-8 +// ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP} + // XFAIL: glibc-old-ru_RU-decimal-point // @@ -52,7 +54,8 @@ class my_facetw }; static std::wstring convert_thousands_sep(std::wstring const& in) { - return 
LocaleHelpers::convert_thousands_sep_ru_RU(in); + const wchar_t ru_sep = LocaleHelpers::mon_thousands_sep_or_default(RU_MON_THOU_SEP); + return LocaleHelpers::convert_thousands_sep(in, ru_sep); } #endif // TEST_HAS_NO_WIDE_CHARACTERS diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp index 47a48deb3368c..9ac95cc52ac07 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp @@ -13,6 +13,8 @@ // REQUIRES: locale.fr_FR.UTF-8 +// ADDITIONAL_COMPILE_FLAGS: -DFR_MON_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_MON_THOUSANDS_SEP} + // // class money_put @@ -59,7 +61,8 @@ class my_facetw }; static std::wstring convert_thousands_sep(std::wstring const& in) { - return LocaleHelpers::convert_thousands_sep_fr_FR(in); + const wchar_t fr_sep = LocaleHelpers::mon_thousands_sep_or_default(FR_MON_THOU_SEP); + return LocaleHelpers::convert_thousands_sep(in, fr_sep); } #endif // TEST_HAS_NO_WIDE_CHARACTERS diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp index 4aea1016e735b..be1e397488468 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp @@ -11,6 +11,8 @@ // REQUIRES: locale.ru_RU.UTF-8 +// ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP} + // XFAIL: glibc-old-ru_RU-decimal-point // @@ -52,7 +54,8 @@ class my_facetw }; static std::wstring convert_thousands_sep(std::wstring const& in) { - return LocaleHelpers::convert_thousands_sep_ru_RU(in); + const wchar_t ru_sep = LocaleHelpers::mon_thousands_sep_or_default(RU_MON_THOU_SEP); + return LocaleHelpers::convert_thousands_sep(in, ru_sep); } #endif // TEST_HAS_NO_WIDE_CHARACTERS diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/thousands_sep.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/thousands_sep.pass.cpp index 2a70741d2a0fa..6b6570576a082 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/thousands_sep.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/thousands_sep.pass.cpp @@ -9,13 +9,14 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd -// XFAIL: LIBCXX-FREEBSD-FIXME - // REQUIRES: locale.en_US.UTF-8 // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ru_RU.UTF-8 // REQUIRES: locale.zh_CN.UTF-8 +// ADDITIONAL_COMPILE_FLAGS: -DFR_MON_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_MON_THOUSANDS_SEP} +// ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP} + // // class moneypunct_byname @@ -27,6 +28,7 @@ #include #include 
"test_macros.h" +#include "locale_helpers.h" #include "platform_support.h" // locale name macros class Fnf @@ -110,17 +112,10 @@ int main(int, char**) Fnt f(LOCALE_fr_FR_UTF_8, 1); assert(f.thousands_sep() == ' '); } - // The below tests work around GLIBC's use of U202F as mon_thousands_sep. + #ifndef TEST_HAS_NO_WIDE_CHARACTERS -#if defined(_CS_GNU_LIBC_VERSION) - const wchar_t fr_sep = glibc_version_less_than("2.27") ? L' ' : L'\u202F'; -#elif defined(_WIN32) - const wchar_t fr_sep = L'\u00A0'; -#elif defined(_AIX) - const wchar_t fr_sep = L'\u202F'; -#else - const wchar_t fr_sep = L' '; -#endif + const wchar_t fr_sep = LocaleHelpers::mon_thousands_sep_or_default(FR_MON_THOU_SEP); + { Fwf f(LOCALE_fr_FR_UTF_8, 1); assert(f.thousands_sep() == fr_sep); @@ -140,19 +135,8 @@ int main(int, char**) assert(f.thousands_sep() == sep); } #ifndef TEST_HAS_NO_WIDE_CHARACTERS - // The below tests work around GLIBC's use of U00A0 as mon_thousands_sep - // and U002E as mon_decimal_point. - // TODO: Fix thousands_sep for 'char'. - // related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=16006 -# if defined(_CS_GNU_LIBC_VERSION) - // FIXME libc++ specifically works around \u00A0 by translating it into - // a regular space. - const wchar_t wsep = glibc_version_less_than("2.27") ? L'\u00A0' : L'\u202F'; -# elif defined(_WIN32) || defined(_AIX) - const wchar_t wsep = L'\u00A0'; -# else - const wchar_t wsep = L' '; -# endif + const wchar_t wsep = LocaleHelpers::mon_thousands_sep_or_default(RU_MON_THOU_SEP); + { Fwf f(LOCALE_ru_RU_UTF_8, 1); assert(f.thousands_sep() == wsep); diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp index 850352b3bc1ec..ccecd85f2ff87 100644 --- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp @@ -14,6 +14,8 @@ // REQUIRES: locale.en_US.UTF-8 // REQUIRES: locale.fr_FR.UTF-8 +// ADDITIONAL_COMPILE_FLAGS: -DFR_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} + // // template class numpunct_byname; @@ -25,6 +27,7 @@ #include #include "test_macros.h" +#include "locale_helpers.h" #include "platform_support.h" // locale name macros int main(int, char**) @@ -74,18 +77,11 @@ int main(int, char**) } #ifndef TEST_HAS_NO_WIDE_CHARACTERS { -#if defined(_CS_GNU_LIBC_VERSION) - const wchar_t wsep = glibc_version_less_than("2.27") ? 
L' ' : L'\u202f'; -# elif defined(_AIX) - const wchar_t wsep = L'\u202F'; -# elif defined(_WIN32) - const wchar_t wsep = L'\u00A0'; -# else - const wchar_t wsep = L','; -# endif - typedef wchar_t C; - const std::numpunct& np = std::use_facet >(l); - assert(np.thousands_sep() == wsep); + const wchar_t wsep = LocaleHelpers::thousands_sep_or_default(FR_THOU_SEP); + + typedef wchar_t C; + const std::numpunct& np = std::use_facet >(l); + assert(np.thousands_sep() == wsep); } #endif // TEST_HAS_NO_WIDE_CHARACTERS } diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp index aecb96b58719e..ebf907a49c43e 100644 --- a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp +++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp @@ -16,6 +16,9 @@ // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 +// ADDITIONAL_COMPILE_FLAGS: -DFR_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} +// ADDITIONAL_COMPILE_FLAGS: -DFR_DEC_POINT=%{LOCALE_CONV_FR_FR_UTF_8_DECIMAL_POINT} + // // template> class duration; @@ -33,6 +36,7 @@ #include #include "make_string.h" +#include "locale_helpers.h" #include "platform_support.h" // locale name macros #include "test_macros.h" @@ -88,21 +92,11 @@ static void test_values() { assert(stream_fr_FR_locale(1'000.123456s) == SV("1 000,1235s")); #endif } else { -#ifdef _WIN32 - assert(stream_fr_FR_locale(-1'000'000s) == SV("-1\u00A0000\u00A0000s")); - assert(stream_fr_FR_locale(1'000'000s) == SV("1\u00A0000\u00A0000s")); - assert(stream_fr_FR_locale(-1'000.123456s) == SV("-1\u00A0000,1235s")); - assert(stream_fr_FR_locale(1'000.123456s) == SV("1\u00A0000,1235s")); -#elif defined(__APPLE__) - assert(stream_fr_FR_locale(-1'000'000s) == SV("-1000000s")); - assert(stream_fr_FR_locale(1'000'000s) == SV("1000000s")); - assert(stream_fr_FR_locale(-1'000.123456s) == SV("-1000,1235s")); - assert(stream_fr_FR_locale(1'000.123456s) == SV("1000,1235s")); -#else - assert(stream_fr_FR_locale(-1'000'000s) == SV("-1\u202f000\u202f000s")); - assert(stream_fr_FR_locale(1'000'000s) == SV("1\u202f000\u202f000s")); - assert(stream_fr_FR_locale(-1'000.123456s) == SV("-1\u202f000,1235s")); - assert(stream_fr_FR_locale(1'000.123456s) == SV("1\u202f000,1235s")); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + assert(stream_fr_FR_locale(-1'000'000s) == L"-1" FR_THOU_SEP "000" FR_THOU_SEP "000s"); + assert(stream_fr_FR_locale(1'000'000s) == L"1" FR_THOU_SEP "000" FR_THOU_SEP "000s"); + assert(stream_fr_FR_locale(-1'000.123456s) == L"-1" FR_THOU_SEP "000" FR_DEC_POINT "1235s"); + assert(stream_fr_FR_locale(1'000.123456s) == L"1" FR_THOU_SEP "000" FR_DEC_POINT "1235s"); #endif } diff --git a/libcxx/test/support/locale_helpers.h b/libcxx/test/support/locale_helpers.h index 3eb24ebf28f52..946c2fed0f3a5 100644 --- a/libcxx/test/support/locale_helpers.h +++ b/libcxx/test/support/locale_helpers.h @@ -41,37 +41,6 @@ std::wstring convert_thousands_sep(std::wstring const& in, wchar_t sep) { return out; } -// GLIBC 2.27 and newer use U+202F NARROW NO-BREAK SPACE as a thousands separator. -// This function converts the spaces in string inputs to U+202F if need -// be. FreeBSD's locale data also uses U+202F, since 2018. -// Windows uses U+00A0 NO-BREAK SPACE. 
-std::wstring convert_thousands_sep_fr_FR(std::wstring const& in) { -#if defined(_CS_GNU_LIBC_VERSION) - if (glibc_version_less_than("2.27")) - return in; - else - return convert_thousands_sep(in, L'\u202F'); -#elif defined(__FreeBSD__) - return convert_thousands_sep(in, L'\u202F'); -#elif defined(_WIN32) - return convert_thousands_sep(in, L'\u00A0'); -#else - return in; -#endif -} - -// GLIBC 2.27 uses U+202F NARROW NO-BREAK SPACE as a thousands separator. -// FreeBSD, AIX and Windows use U+00A0 NO-BREAK SPACE. -std::wstring convert_thousands_sep_ru_RU(std::wstring const& in) { -#if defined(TEST_HAS_GLIBC) - return convert_thousands_sep(in, L'\u202F'); -# elif defined(__FreeBSD__) || defined(_WIN32) || defined(_AIX) - return convert_thousands_sep(in, L'\u00A0'); -# else - return in; -# endif -} - std::wstring negate_en_US(std::wstring s) { #if defined(_WIN32) return L"(" + s + L")"; @@ -80,6 +49,12 @@ std::wstring negate_en_US(std::wstring s) { #endif } +wchar_t thousands_sep_or_default(std::wstring s) { return !s.empty() ? s[0] : L','; } + +wchar_t mon_thousands_sep_or_default(std::wstring s) { return thousands_sep_or_default(s); } + +wchar_t decimal_point_or_default(std::wstring s) { return !s.empty() ? s[0] : L'.'; } + #endif // TEST_HAS_NO_WIDE_CHARACTERS std::string negate_en_US(std::string s) { diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index e4b413deff4db..10fc4b0afde6b 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -425,6 +425,10 @@ def _mingwSupportsModules(cfg): "fr_CA.ISO8859-1": ["fr_CA.ISO8859-1", "French_Canada.1252"], "cs_CZ.ISO8859-2": ["cs_CZ.ISO8859-2", "Czech_Czech Republic.1250"], } +provide_locale_conversions = { + "fr_FR.UTF-8": ["decimal_point", "mon_thousands_sep", "thousands_sep"], + "ru_RU.UTF-8": ["mon_thousands_sep"], +} for locale, alts in locales.items(): # Note: Using alts directly in the lambda body here will bind it to the value at the # end of the loop. Assigning it to a default argument works around this issue. @@ -432,10 +436,96 @@ def _mingwSupportsModules(cfg): Feature( name="locale.{}".format(locale), when=lambda cfg, alts=alts: hasAnyLocale(cfg, alts), - ) + actions=lambda cfg, locale=locale, alts=alts: _getLocaleFlagsAction( + cfg, locale, alts, provide_locale_conversions[locale] + ) + if locale in provide_locale_conversions + and ("_LIBCPP_HAS_WIDE_CHARACTERS" not in compilerMacros(cfg) or + compilerMacros(cfg)["_LIBCPP_HAS_WIDE_CHARACTERS"] == "1") + else [], + ), ) +# Provide environment locale conversions through substitutions to avoid platform specific +# maintenance. +def _getLocaleFlagsAction(cfg, locale, alts, members): + alts_list = ",".join([f'"{l}"' for l in alts]) + get_member_list = ",".join([f"lc->{m}" for m in members]) + + localeconv_info = programOutput( + cfg, + r""" + #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS) + #define _CRT_SECURE_NO_WARNINGS + #endif + #include + #include + #include + #include + + // Print each requested locale conversion member on separate lines. + int main() { + const char* locales[] = { %s }; + for (int loc_i = 0; loc_i < %d; ++loc_i) { + if (!setlocale(LC_ALL, locales[loc_i])) { + continue; // Choose first locale name that is recognized. 
+ } + + lconv* lc = localeconv(); + const char* members[] = { %s }; + for (size_t m_i = 0; m_i < %d; ++m_i) { + if (!members[m_i]) { + printf("\n"); // member value is an empty string + continue; + } + + size_t len = mbstowcs(nullptr, members[m_i], 0); + if (len == static_cast(-1)) { + fprintf(stderr, "mbstowcs failed unexpectedly\n"); + return 1; + } + // Include room for null terminator. Use malloc as these features + // are also used by lit configs that don't use -lc++ (libunwind tests). + wchar_t* dst = (wchar_t*)malloc((len + 1) * sizeof(wchar_t)); + size_t ret = mbstowcs(dst, members[m_i], len + 1); + if (ret == static_cast(-1)) { + fprintf(stderr, "mbstowcs failed unexpectedly\n"); + free(dst); + return 1; + } + + for (size_t i = 0; i < len; ++i) { + if (dst[i] > 0x7F) { + printf("\\u%%04x", dst[i]); + } else { + // c++03 does not allow basic ascii-range characters in UCNs + printf("%%c", (char)dst[i]); + } + } + printf("\n"); + free(dst); + } + return 0; + } + + return 1; + } + """ + % (alts_list, len(alts), get_member_list, len(members)), + ) + valid_define_name = re.sub(r"[.-]", "_", locale).upper() + return [ + # Provide locale conversion through a substitution. + # Example: %{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} = L"\u202f" + AddSubstitution( + f"%{{LOCALE_CONV_{valid_define_name}_{member.upper()}}}", + lambda cfg, value=value: f"'L\"{value}\"'", + ) + for member, value in zip(members, localeconv_info.split("\n")) + ] + + # Add features representing the target platform name: darwin, linux, windows, etc... DEFAULT_FEATURES += [ Feature(name="darwin", when=lambda cfg: "__APPLE__" in compilerMacros(cfg)), diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index ac3ac57bd17f4..a669b7e9296f6 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -492,6 +492,12 @@ void LinkerDriver::parseDirectives(InputFile *file) { case OPT_alternatename: parseAlternateName(arg->getValue()); break; + case OPT_arm64xsameaddress: + if (!file->symtab.isEC()) + Warn(ctx) << arg->getSpelling() + << " is not allowed in non-ARM64EC files (" << toString(file) + << ")"; + break; case OPT_defaultlib: if (std::optional path = findLibIfNew(arg->getValue())) enqueuePath(*path, false, false); @@ -2639,10 +2645,8 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { createECExportThunks(); // Resolve remaining undefined symbols and warn about imported locals. 
- ctx.forEachSymtab([&](SymbolTable &symtab) { - while (symtab.resolveRemainingUndefines()) - run(); - }); + ctx.forEachSymtab( + [&](SymbolTable &symtab) { symtab.resolveRemainingUndefines(); }); if (errorCount()) return; diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index b6fd3d0daaef9..ea2e7ded38043 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -355,3 +355,4 @@ def tlbid : P_priv<"tlbid">; def tlbout : P_priv<"tlbout">; def verbose_all : P_priv<"verbose">; def guardsym : P_priv<"guardsym">; +def arm64xsameaddress : P_priv<"arm64xsameaddress">; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 307bd4a0c9411..a146e5211736e 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -214,7 +214,8 @@ struct UndefinedDiag { std::vector files; }; -static void reportUndefinedSymbol(COFFLinkerContext &ctx, +static void reportUndefinedSymbol(SymbolTable *symTab, + COFFLinkerContext &ctx, const UndefinedDiag &undefDiag) { auto diag = errorOrWarn(ctx); diag << "undefined symbol: " << undefDiag.sym; @@ -232,6 +233,17 @@ static void reportUndefinedSymbol(COFFLinkerContext &ctx, } if (numDisplayedRefs < numRefs) diag << "\n>>> referenced " << numRefs - numDisplayedRefs << " more times"; + + // Hints + StringRef name = undefDiag.sym->getName(); + if (name.consume_front("__imp_")) { + Symbol *imp = symTab->find(name); + if (imp && imp->isLazy()) { + diag << "\nNOTE: a relevant symbol '" << imp->getName() + << "' is available in " << toString(imp->getFile()) + << " but cannot be used because it is not an import library."; + } + } } void SymbolTable::loadMinGWSymbols() { @@ -402,7 +414,7 @@ void SymbolTable::reportProblemSymbols( processFile(file, file->getSymbols()); for (const UndefinedDiag &undefDiag : undefDiags) - reportUndefinedSymbol(ctx, undefDiag); + reportUndefinedSymbol(this, ctx, undefDiag); } void SymbolTable::reportUnresolvable() { @@ -432,11 +444,10 @@ void SymbolTable::reportUnresolvable() { reportProblemSymbols(undefs, /*localImports=*/nullptr, true); } -bool SymbolTable::resolveRemainingUndefines() { +void SymbolTable::resolveRemainingUndefines() { llvm::TimeTraceScope timeScope("Resolve remaining undefined symbols"); SmallPtrSet undefs; DenseMap localImports; - bool foundLazy = false; for (auto &i : symMap) { Symbol *sym = i.second; @@ -481,11 +492,6 @@ bool SymbolTable::resolveRemainingUndefines() { imp = findLocalSym(*mangledName); } } - if (imp && imp->isLazy()) { - forceLazy(imp); - foundLazy = true; - continue; - } if (imp && isa(imp)) { auto *d = cast(imp); replaceSymbol(sym, ctx, name, d); @@ -513,7 +519,6 @@ bool SymbolTable::resolveRemainingUndefines() { reportProblemSymbols( undefs, ctx.config.warnLocallyDefinedImported ? &localImports : nullptr, false); - return foundLazy; } std::pair SymbolTable::insert(StringRef name) { diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index ff6e8487f0734..2916c23d95c87 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -58,10 +58,7 @@ class SymbolTable { // Try to resolve any undefined symbols and update the symbol table // accordingly, then print an error message for any remaining undefined // symbols and warn about imported local symbols. - // Returns whether more files might need to be linked in to resolve lazy - // symbols, in which case the caller is expected to call the function again - // after linking those files. 
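// Sketch of the new calling convention (mirrors the Driver.cpp change earlier in
// this patch): with the "found lazy symbols" result gone, the driver performs a
// single pass per symbol table instead of looping until quiescence:
//
//   ctx.forEachSymtab(
//       [&](SymbolTable &symtab) { symtab.resolveRemainingUndefines(); });
//
// Lazy __imp_ candidates are no longer force-loaded here; reportUndefinedSymbol
// now surfaces them as a hint instead.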
- bool resolveRemainingUndefines(); + void resolveRemainingUndefines(); // Load lazy objects that are needed for MinGW automatic import and for // doing stdcall fixups. diff --git a/lld/test/COFF/arm64x-sameaddress.test b/lld/test/COFF/arm64x-sameaddress.test new file mode 100644 index 0000000000000..c69be9d268c3b --- /dev/null +++ b/lld/test/COFF/arm64x-sameaddress.test @@ -0,0 +1,56 @@ +REQUIRES: aarch64 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func-arm64ec.s -o func-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func-arm64.s -o func-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows drectve.s -o drectve.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows drectve.s -o drectve-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj + +RUN: lld-link -machine:arm64x -dll -noentry -out:out.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: func-arm64.obj func-arm64ec.obj drectve.obj + +RUN: lld-link -machine:arm64x -dll -noentry -out:out-cmd.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: func-arm64.obj func-arm64ec.obj -arm64xsameaddress:func + +RUN: lld-link -machine:arm64ec -dll -noentry -out:out-ec.dll loadconfig-arm64ec.obj func-arm64ec.obj drectve.obj + +RUN: lld-link -machine:arm64x -dll -noentry -out:out-warn.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: func-arm64.obj func-arm64ec.obj drectve-arm64.obj 2>&1 | FileCheck --check-prefix=WARN %s +WARN: lld-link: warning: -arm64xsameaddress: is not allowed in non-ARM64EC files (drectve-arm64.obj) + +#--- func-arm64.s + .section .text,"xr",discard,func + .globl func +func: + mov x0, #1 + ret + +#--- func-arm64ec.s + .section .text,"xr",discard,"#func" + .globl "#func" +"#func": + mov x0, #2 + ret + + .weak_anti_dep func + .set func,"#func" + + .section .wowthk,"xr",discard,entry_thunk + .globl entry_thunk +entry_thunk: + mov x0, #3 + ret + + .section .test,"dr" + .rva func + + .section .hybmp$x,"yi" + .symidx "#func" + .symidx entry_thunk + .word 1 + +#--- drectve.s + .section .drectve, "yn" + .ascii " -arm64xsameaddress:func" diff --git a/lld/test/COFF/imports-static-lib-indirect.test b/lld/test/COFF/imports-static-lib-indirect.test new file mode 100644 index 0000000000000..beda0d7a31afd --- /dev/null +++ b/lld/test/COFF/imports-static-lib-indirect.test @@ -0,0 +1,26 @@ +# REQUIRES: x86 + +# Pulling in on both a dllimport symbol and a static symbol should only warn. 
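# Informal walk-through: main.s references both `other` and `__imp_foo`. Resolving
# `other` pulls other.obj out of other.lib, which also defines `foo`, so the
# remaining `__imp_foo` reference is satisfied by the locally defined `foo` and
# lld emits only the LNK4217 "locally defined symbol imported" warning checked below.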
+# RUN: split-file %s %t.dir +# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/other.s -o %t.other.obj +# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/main.s -o %t.main.obj +# RUN: llvm-lib %t.other.obj -out:%t.other.lib +# RUN: lld-link %t.other.lib %t.main.obj -out:%t.dll -dll 2>&1 | FileCheck %s + +CHECK: warning: {{.*}} locally defined symbol imported: foo {{.*}} [LNK4217] + +#--- other.s +.text +.globl other +.globl foo +other: + ret +foo: + ret +#--- main.s +.text +.global _DllMainCRTStartup +_DllMainCRTStartup: + call *other(%rip) + call *__imp_foo(%rip) + ret diff --git a/lld/test/COFF/imports-static-lib.test b/lld/test/COFF/imports-static-lib.test new file mode 100644 index 0000000000000..8e9525dab5284 --- /dev/null +++ b/lld/test/COFF/imports-static-lib.test @@ -0,0 +1,33 @@ +# REQUIRES: x86 + +# Ensure that we don't import dllimport symbols from static (non-import) libraries +# RUN: split-file %s %t.dir +# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/foo.s -o %t.foo.obj +# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/main.s -o %t.main.obj +# RUN: llvm-lib %t.foo.obj -out:%t.foo.lib +# RUN: not lld-link %t.foo.lib %t.main.obj -out:%t.dll -dll 2>&1 | FileCheck %s + +CHECK: error: undefined symbol: __declspec(dllimport) foo +CHECK: NOTE: a relevant symbol 'foo' is available in {{.*}}.foo.lib but cannot be used because it is not an import library. + +# Now do the same thing, but import the symbol from a import library. +# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/foo_dll_main.s -o %t.foo_dll_main.obj +# RUN: lld-link /out:%t.dll /dll %t.foo.obj %t.foo_dll_main.obj /export:foo /implib:%t.foo.imp.lib +# RUN: lld-link %t.main.obj %t.foo.imp.lib -out:%t.exe -dll + +#--- foo.s +.text +.globl foo +foo: + ret +#--- foo_dll_main.s +.text +.global _DllMainCRTStartup +_DllMainCRTStartup: + ret +#--- main.s +.text +.global _DllMainCRTStartup +_DllMainCRTStartup: + call *__imp_foo(%rip) + ret diff --git a/lld/test/COFF/undefined_lazy.test b/lld/test/COFF/undefined_lazy.test deleted file mode 100644 index ed5cd358b5cd9..0000000000000 --- a/lld/test/COFF/undefined_lazy.test +++ /dev/null @@ -1,26 +0,0 @@ -# REQUIRES: x86 - -# RUN: split-file %s %t.dir -# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/foo.s -o %t.foo.obj -# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/bar.s -o %t.bar.obj -# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/qux.s -o %t.qux.obj -# RUN: llvm-lib %t.foo.obj -out:%t.foo.lib -# RUN: llvm-lib %t.bar.obj -out:%t.bar.lib -# RUN: lld-link %t.foo.lib %t.bar.lib %t.qux.obj -out:%t.dll -dll -# -#--- foo.s -.text -.globl foo -foo: - call bar -#--- bar.s -.text -.globl bar -bar: - ret -#--- qux.s -.text -.global _DllMainCRTStartup -_DllMainCRTStartup: - call *__imp_foo(%rip) - ret diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index 1fa6c42d9cd86..527edc11c48e3 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -32,6 +32,11 @@ class InputTable; class InputGlobal; class InputFunction; class Symbol; +class DefinedData; +class GlobalSymbol; +class DefinedFunction; +class UndefinedGlobal; +class TableSymbol; // For --unresolved-symbols. enum class UnresolvedPolicy { ReportError, Warn, Ignore, ImportDynamic }; @@ -139,6 +144,107 @@ struct Ctx { llvm::SmallVector syntheticGlobals; llvm::SmallVector syntheticTables; + // linker-generated symbols + struct WasmSym { + // __global_base + // Symbol marking the start of the global section. 
+ DefinedData *globalBase; + + // __stack_pointer/__stack_low/__stack_high + // Global that holds current value of stack pointer and data symbols marking + // the start and end of the stack region. stackPointer is initialized to + // stackHigh and grows downwards towards stackLow + GlobalSymbol *stackPointer; + DefinedData *stackLow; + DefinedData *stackHigh; + + // __tls_base + // Global that holds the address of the base of the current thread's + // TLS block. + GlobalSymbol *tlsBase; + + // __tls_size + // Symbol whose value is the size of the TLS block. + GlobalSymbol *tlsSize; + + // __tls_size + // Symbol whose value is the alignment of the TLS block. + GlobalSymbol *tlsAlign; + + // __data_end + // Symbol marking the end of the data and bss. + DefinedData *dataEnd; + + // __heap_base/__heap_end + // Symbols marking the beginning and end of the "heap". It starts at the end + // of the data, bss and explicit stack, and extends to the end of the linear + // memory allocated by wasm-ld. This region of memory is not used by the + // linked code, so it may be used as a backing store for `sbrk` or `malloc` + // implementations. + DefinedData *heapBase; + DefinedData *heapEnd; + + // __wasm_init_memory_flag + // Symbol whose contents are nonzero iff memory has already been + // initialized. + DefinedData *initMemoryFlag; + + // __wasm_init_memory + // Function that initializes passive data segments during instantiation. + DefinedFunction *initMemory; + + // __wasm_call_ctors + // Function that directly calls all ctors in priority order. + DefinedFunction *callCtors; + + // __wasm_call_dtors + // Function that calls the libc/etc. cleanup function. + DefinedFunction *callDtors; + + // __wasm_apply_global_relocs + // Function that applies relocations to wasm globals post-instantiation. + // Unlike __wasm_apply_data_relocs this needs to run on every thread. + DefinedFunction *applyGlobalRelocs; + + // __wasm_apply_tls_relocs + // Like __wasm_apply_data_relocs but for TLS section. These must be + // delayed until __wasm_init_tls. + DefinedFunction *applyTLSRelocs; + + // __wasm_apply_global_tls_relocs + // Like applyGlobalRelocs but for globals that hold TLS addresses. These + // must be delayed until __wasm_init_tls. + DefinedFunction *applyGlobalTLSRelocs; + + // __wasm_init_tls + // Function that allocates thread-local storage and initializes it. + DefinedFunction *initTLS; + + // Pointer to the function that is to be used in the start section. + // (normally an alias of initMemory, or applyGlobalRelocs). + DefinedFunction *startFunction; + + // __dso_handle + // Symbol used in calls to __cxa_atexit to determine current DLL + DefinedData *dsoHandle; + + // __table_base + // Used in PIC code for offset of indirect function table + UndefinedGlobal *tableBase; + DefinedData *definedTableBase; + + // __memory_base + // Used in PIC code for offset of global data + UndefinedGlobal *memoryBase; + DefinedData *definedMemoryBase; + + // __indirect_function_table + // Used as an address space for function pointers, with each function that + // is used as a function pointer being allocated a slot. + TableSymbol *indirectFunctionTable; + }; + WasmSym sym; + // True if we are creating position-independent code. 
bool isPic = false; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index c3a74dde6480e..467c49e9981bc 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -70,6 +70,7 @@ void Ctx::reset() { isPic = false; legacyFunctionTable = false; emitBssSegments = false; + sym = WasmSym{}; } namespace { @@ -941,14 +942,14 @@ static void createSyntheticSymbols() { true}; static llvm::wasm::WasmGlobalType mutableGlobalTypeI64 = {WASM_TYPE_I64, true}; - WasmSym::callCtors = symtab->addSyntheticFunction( + ctx.sym.callCtors = symtab->addSyntheticFunction( "__wasm_call_ctors", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_call_ctors")); bool is64 = ctx.arg.is64.value_or(false); if (ctx.isPic) { - WasmSym::stackPointer = + ctx.sym.stackPointer = createUndefinedGlobal("__stack_pointer", ctx.arg.is64.value_or(false) ? &mutableGlobalTypeI64 : &mutableGlobalTypeI32); @@ -958,25 +959,24 @@ static void createSyntheticSymbols() { // See: // https://github.com/WebAssembly/tool-conventions/blob/main/DynamicLinking.md auto *globalType = is64 ? &globalTypeI64 : &globalTypeI32; - WasmSym::memoryBase = createUndefinedGlobal("__memory_base", globalType); - WasmSym::tableBase = createUndefinedGlobal("__table_base", globalType); - WasmSym::memoryBase->markLive(); - WasmSym::tableBase->markLive(); + ctx.sym.memoryBase = createUndefinedGlobal("__memory_base", globalType); + ctx.sym.tableBase = createUndefinedGlobal("__table_base", globalType); + ctx.sym.memoryBase->markLive(); + ctx.sym.tableBase->markLive(); } else { // For non-PIC code - WasmSym::stackPointer = createGlobalVariable("__stack_pointer", true); - WasmSym::stackPointer->markLive(); + ctx.sym.stackPointer = createGlobalVariable("__stack_pointer", true); + ctx.sym.stackPointer->markLive(); } if (ctx.arg.sharedMemory) { - WasmSym::tlsBase = createGlobalVariable("__tls_base", true); - WasmSym::tlsSize = createGlobalVariable("__tls_size", false); - WasmSym::tlsAlign = createGlobalVariable("__tls_align", false); - WasmSym::initTLS = symtab->addSyntheticFunction( + ctx.sym.tlsBase = createGlobalVariable("__tls_base", true); + ctx.sym.tlsSize = createGlobalVariable("__tls_size", false); + ctx.sym.tlsAlign = createGlobalVariable("__tls_align", false); + ctx.sym.initTLS = symtab->addSyntheticFunction( "__wasm_init_tls", WASM_SYMBOL_VISIBILITY_HIDDEN, - make( - is64 ? i64ArgSignature : i32ArgSignature, - "__wasm_init_tls")); + make(is64 ? 
i64ArgSignature : i32ArgSignature, + "__wasm_init_tls")); } } @@ -984,19 +984,19 @@ static void createOptionalSymbols() { if (ctx.arg.relocatable) return; - WasmSym::dsoHandle = symtab->addOptionalDataSymbol("__dso_handle"); + ctx.sym.dsoHandle = symtab->addOptionalDataSymbol("__dso_handle"); if (!ctx.arg.shared) - WasmSym::dataEnd = symtab->addOptionalDataSymbol("__data_end"); + ctx.sym.dataEnd = symtab->addOptionalDataSymbol("__data_end"); if (!ctx.isPic) { - WasmSym::stackLow = symtab->addOptionalDataSymbol("__stack_low"); - WasmSym::stackHigh = symtab->addOptionalDataSymbol("__stack_high"); - WasmSym::globalBase = symtab->addOptionalDataSymbol("__global_base"); - WasmSym::heapBase = symtab->addOptionalDataSymbol("__heap_base"); - WasmSym::heapEnd = symtab->addOptionalDataSymbol("__heap_end"); - WasmSym::definedMemoryBase = symtab->addOptionalDataSymbol("__memory_base"); - WasmSym::definedTableBase = symtab->addOptionalDataSymbol("__table_base"); + ctx.sym.stackLow = symtab->addOptionalDataSymbol("__stack_low"); + ctx.sym.stackHigh = symtab->addOptionalDataSymbol("__stack_high"); + ctx.sym.globalBase = symtab->addOptionalDataSymbol("__global_base"); + ctx.sym.heapBase = symtab->addOptionalDataSymbol("__heap_base"); + ctx.sym.heapEnd = symtab->addOptionalDataSymbol("__heap_end"); + ctx.sym.definedMemoryBase = symtab->addOptionalDataSymbol("__memory_base"); + ctx.sym.definedTableBase = symtab->addOptionalDataSymbol("__table_base"); } // For non-shared memory programs we still need to define __tls_base since we @@ -1009,7 +1009,7 @@ static void createOptionalSymbols() { // __tls_size and __tls_align are not needed in this case since they are only // needed for __wasm_init_tls (which we do not create in this case). if (!ctx.arg.sharedMemory) - WasmSym::tlsBase = createOptionalGlobal("__tls_base", false); + ctx.sym.tlsBase = createOptionalGlobal("__tls_base", false); } static void processStubLibrariesPreLTO() { @@ -1384,9 +1384,9 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // by libc/etc., because destructors are registered dynamically with // `__cxa_atexit` and friends. if (!ctx.arg.relocatable && !ctx.arg.shared && - !WasmSym::callCtors->isUsedInRegularObj && - WasmSym::callCtors->getName() != ctx.arg.entry && - !ctx.arg.exportedSymbols.count(WasmSym::callCtors->getName())) { + !ctx.sym.callCtors->isUsedInRegularObj && + ctx.sym.callCtors->getName() != ctx.arg.entry && + !ctx.arg.exportedSymbols.count(ctx.sym.callCtors->getName())) { if (Symbol *callDtors = handleUndefined("__wasm_call_dtors", "")) { if (auto *callDtorsFunc = dyn_cast(callDtors)) { @@ -1395,7 +1395,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { !callDtorsFunc->signature->Returns.empty())) { error("__wasm_call_dtors must have no argument or return values"); } - WasmSym::callDtors = callDtorsFunc; + ctx.sym.callDtors = callDtorsFunc; } else { error("__wasm_call_dtors must be a function"); } @@ -1488,7 +1488,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { markLive(); // Provide the indirect function table if needed. 
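// Pattern applied throughout this refactor: the former static WasmSym members
// become per-link state on the context object, e.g. (sketch)
//
//   WasmSym::callCtors->markLive();   // before: global static
//   ctx.sym.callCtors->markLive();    // after:  owned by Ctx, cleared in
//                                     //         Ctx::reset() via `sym = WasmSym{}`
//
// presumably so that repeated in-process links start from a clean set of
// linker-generated symbols rather than stale pointers.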
- WasmSym::indirectFunctionTable = + ctx.sym.indirectFunctionTable = symtab->resolveIndirectFunctionTable(/*required =*/false); if (errorCount()) diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp index ccdc92f5c8d71..0e6c4e691be10 100644 --- a/lld/wasm/InputChunks.cpp +++ b/lld/wasm/InputChunks.cpp @@ -397,9 +397,9 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { if (ctx.isPic) { writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); if (isTLS()) - writeUleb128(os, WasmSym::tlsBase->getGlobalIndex(), "tls_base"); + writeUleb128(os, ctx.sym.tlsBase->getGlobalIndex(), "tls_base"); else - writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(), "memory_base"); + writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "memory_base"); writeU8(os, opcode_ptr_add, "ADD"); } @@ -422,12 +422,12 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { } } else { assert(ctx.isPic); - const GlobalSymbol* baseSymbol = WasmSym::memoryBase; + const GlobalSymbol *baseSymbol = ctx.sym.memoryBase; if (rel.Type == R_WASM_TABLE_INDEX_I32 || rel.Type == R_WASM_TABLE_INDEX_I64) - baseSymbol = WasmSym::tableBase; + baseSymbol = ctx.sym.tableBase; else if (sym->isTLS()) - baseSymbol = WasmSym::tlsBase; + baseSymbol = ctx.sym.tlsBase; writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, baseSymbol->getGlobalIndex(), "base"); writeU8(os, opcode_reloc_const, "CONST"); diff --git a/lld/wasm/MarkLive.cpp b/lld/wasm/MarkLive.cpp index 13c7a3d894fe3..2b2cf19f14b30 100644 --- a/lld/wasm/MarkLive.cpp +++ b/lld/wasm/MarkLive.cpp @@ -114,8 +114,8 @@ void MarkLive::run() { if (sym->isNoStrip() || sym->isExported()) enqueue(sym); - if (WasmSym::callDtors) - enqueue(WasmSym::callDtors); + if (ctx.sym.callDtors) + enqueue(ctx.sym.callDtors); for (const ObjFile *obj : ctx.objectFiles) if (obj->isLive()) { @@ -131,7 +131,7 @@ void MarkLive::run() { // If we have any non-discarded init functions, mark `__wasm_call_ctors` as // live so that we assign it an index and call it. 
if (isCallCtorsLive()) - WasmSym::callCtors->markLive(); + ctx.sym.callCtors->markLive(); } void MarkLive::mark() { diff --git a/lld/wasm/OutputSections.cpp b/lld/wasm/OutputSections.cpp index 95f7ecc29de6b..4142a913c8cbf 100644 --- a/lld/wasm/OutputSections.cpp +++ b/lld/wasm/OutputSections.cpp @@ -123,7 +123,7 @@ void DataSection::finalizeContents() { if ((segment->initFlags & WASM_DATA_SEGMENT_IS_PASSIVE) == 0) { if (ctx.isPic && ctx.arg.extendedConst) { writeU8(os, WASM_OPCODE_GLOBAL_GET, "global get"); - writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(), + writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "literal (global index)"); if (segment->startVA) { writePtrConst(os, segment->startVA, is64, "offset"); @@ -136,7 +136,7 @@ void DataSection::finalizeContents() { if (ctx.isPic) { assert(segment->startVA == 0); initExpr.Inst.Opcode = WASM_OPCODE_GLOBAL_GET; - initExpr.Inst.Value.Global = WasmSym::memoryBase->getGlobalIndex(); + initExpr.Inst.Value.Global = ctx.sym.memoryBase->getGlobalIndex(); } else { initExpr = intConst(segment->startVA, is64); } diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp index a687fd6d6c4ef..92a933ecbb024 100644 --- a/lld/wasm/Symbols.cpp +++ b/lld/wasm/Symbols.cpp @@ -77,31 +77,6 @@ std::string toString(wasm::Symbol::Kind kind) { } namespace wasm { -DefinedFunction *WasmSym::callCtors; -DefinedFunction *WasmSym::callDtors; -DefinedFunction *WasmSym::initMemory; -DefinedFunction *WasmSym::applyGlobalRelocs; -DefinedFunction *WasmSym::applyTLSRelocs; -DefinedFunction *WasmSym::applyGlobalTLSRelocs; -DefinedFunction *WasmSym::initTLS; -DefinedFunction *WasmSym::startFunction; -DefinedData *WasmSym::dsoHandle; -DefinedData *WasmSym::dataEnd; -DefinedData *WasmSym::globalBase; -DefinedData *WasmSym::heapBase; -DefinedData *WasmSym::heapEnd; -DefinedData *WasmSym::initMemoryFlag; -GlobalSymbol *WasmSym::stackPointer; -DefinedData *WasmSym::stackLow; -DefinedData *WasmSym::stackHigh; -GlobalSymbol *WasmSym::tlsBase; -GlobalSymbol *WasmSym::tlsSize; -GlobalSymbol *WasmSym::tlsAlign; -UndefinedGlobal *WasmSym::tableBase; -DefinedData *WasmSym::definedTableBase; -UndefinedGlobal *WasmSym::memoryBase; -DefinedData *WasmSym::definedMemoryBase; -TableSymbol *WasmSym::indirectFunctionTable; WasmSymbolType Symbol::getWasmType() const { if (isa(this)) diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h index b409fffc50a6c..55ee21939ce07 100644 --- a/lld/wasm/Symbols.h +++ b/lld/wasm/Symbols.h @@ -537,105 +537,6 @@ class LazySymbol : public Symbol { const WasmSignature *signature = nullptr; }; -// linker-generated symbols -struct WasmSym { - // __global_base - // Symbol marking the start of the global section. - static DefinedData *globalBase; - - // __stack_pointer/__stack_low/__stack_high - // Global that holds current value of stack pointer and data symbols marking - // the start and end of the stack region. stackPointer is initialized to - // stackHigh and grows downwards towards stackLow - static GlobalSymbol *stackPointer; - static DefinedData *stackLow; - static DefinedData *stackHigh; - - // __tls_base - // Global that holds the address of the base of the current thread's - // TLS block. - static GlobalSymbol *tlsBase; - - // __tls_size - // Symbol whose value is the size of the TLS block. - static GlobalSymbol *tlsSize; - - // __tls_size - // Symbol whose value is the alignment of the TLS block. - static GlobalSymbol *tlsAlign; - - // __data_end - // Symbol marking the end of the data and bss. 
- static DefinedData *dataEnd; - - // __heap_base/__heap_end - // Symbols marking the beginning and end of the "heap". It starts at the end - // of the data, bss and explicit stack, and extends to the end of the linear - // memory allocated by wasm-ld. This region of memory is not used by the - // linked code, so it may be used as a backing store for `sbrk` or `malloc` - // implementations. - static DefinedData *heapBase; - static DefinedData *heapEnd; - - // __wasm_init_memory_flag - // Symbol whose contents are nonzero iff memory has already been initialized. - static DefinedData *initMemoryFlag; - - // __wasm_init_memory - // Function that initializes passive data segments during instantiation. - static DefinedFunction *initMemory; - - // __wasm_call_ctors - // Function that directly calls all ctors in priority order. - static DefinedFunction *callCtors; - - // __wasm_call_dtors - // Function that calls the libc/etc. cleanup function. - static DefinedFunction *callDtors; - - // __wasm_apply_global_relocs - // Function that applies relocations to wasm globals post-instantiation. - // Unlike __wasm_apply_data_relocs this needs to run on every thread. - static DefinedFunction *applyGlobalRelocs; - - // __wasm_apply_tls_relocs - // Like __wasm_apply_data_relocs but for TLS section. These must be - // delayed until __wasm_init_tls. - static DefinedFunction *applyTLSRelocs; - - // __wasm_apply_global_tls_relocs - // Like applyGlobalRelocs but for globals that hold TLS addresses. These - // must be delayed until __wasm_init_tls. - static DefinedFunction *applyGlobalTLSRelocs; - - // __wasm_init_tls - // Function that allocates thread-local storage and initializes it. - static DefinedFunction *initTLS; - - // Pointer to the function that is to be used in the start section. - // (normally an alias of initMemory, or applyGlobalRelocs). - static DefinedFunction *startFunction; - - // __dso_handle - // Symbol used in calls to __cxa_atexit to determine current DLL - static DefinedData *dsoHandle; - - // __table_base - // Used in PIC code for offset of indirect function table - static UndefinedGlobal *tableBase; - static DefinedData *definedTableBase; - - // __memory_base - // Used in PIC code for offset of global data - static UndefinedGlobal *memoryBase; - static DefinedData *definedMemoryBase; - - // __indirect_function_table - // Used as an address space for function pointers, with each function that is - // used as a function pointer being allocated a slot. - static TableSymbol *indirectFunctionTable; -}; - // A buffer class that is large enough to hold any Symbol-derived // object. We allocate memory using this class and instantiate a symbol // using the placement new. diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index 7fb44b9f0c009..0e2aa57e9048e 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -319,8 +319,8 @@ void TableSection::addTable(InputTable *table) { // Some inputs require that the indirect function table be assigned to table // number 0. if (ctx.legacyFunctionTable && - isa(WasmSym::indirectFunctionTable) && - cast(WasmSym::indirectFunctionTable)->table == table) { + isa(ctx.sym.indirectFunctionTable) && + cast(ctx.sym.indirectFunctionTable)->table == table) { if (out.importSec->getNumImportedTables()) { // Alack! Some other input imported a table, meaning that we are unable // to assign table number 0 to the indirect function table. 
@@ -395,8 +395,8 @@ void GlobalSection::assignIndexes() { } static void ensureIndirectFunctionTable() { - if (!WasmSym::indirectFunctionTable) - WasmSym::indirectFunctionTable = + if (!ctx.sym.indirectFunctionTable) + ctx.sym.indirectFunctionTable = symtab->resolveIndirectFunctionTable(/*required =*/true); } @@ -430,10 +430,9 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { // Get __memory_base writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); if (sym->isTLS()) - writeUleb128(os, WasmSym::tlsBase->getGlobalIndex(), "__tls_base"); + writeUleb128(os, ctx.sym.tlsBase->getGlobalIndex(), "__tls_base"); else - writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(), - "__memory_base"); + writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "__memory_base"); // Add the virtual address of the data symbol writeU8(os, opcode_ptr_const, "CONST"); @@ -443,7 +442,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { continue; // Get __table_base writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); - writeUleb128(os, WasmSym::tableBase->getGlobalIndex(), "__table_base"); + writeUleb128(os, ctx.sym.tableBase->getGlobalIndex(), "__table_base"); // Add the table index to __table_base writeU8(os, opcode_ptr_const, "CONST"); @@ -490,13 +489,13 @@ void GlobalSection::writeBody() { if (ctx.arg.extendedConst && ctx.isPic) { if (auto *d = dyn_cast(sym)) { if (!sym->isTLS()) { - globalIdx = WasmSym::memoryBase->getGlobalIndex(); + globalIdx = ctx.sym.memoryBase->getGlobalIndex(); offset = d->getVA(); useExtendedConst = true; } } else if (auto *f = dyn_cast(sym)) { if (!sym->isStub) { - globalIdx = WasmSym::tableBase->getGlobalIndex(); + globalIdx = ctx.sym.tableBase->getGlobalIndex(); offset = f->getTableIndex(); useExtendedConst = true; } @@ -550,14 +549,11 @@ void ExportSection::writeBody() { writeExport(os, export_); } -bool StartSection::isNeeded() const { - return WasmSym::startFunction != nullptr; -} +bool StartSection::isNeeded() const { return ctx.sym.startFunction != nullptr; } void StartSection::writeBody() { raw_ostream &os = bodyOutputStream; - writeUleb128(os, WasmSym::startFunction->getFunctionIndex(), - "function index"); + writeUleb128(os, ctx.sym.startFunction->getFunctionIndex(), "function index"); } void ElemSection::addEntry(FunctionSymbol *sym) { @@ -573,9 +569,9 @@ void ElemSection::addEntry(FunctionSymbol *sym) { void ElemSection::writeBody() { raw_ostream &os = bodyOutputStream; - assert(WasmSym::indirectFunctionTable); + assert(ctx.sym.indirectFunctionTable); writeUleb128(os, 1, "segment count"); - uint32_t tableNumber = WasmSym::indirectFunctionTable->getTableNumber(); + uint32_t tableNumber = ctx.sym.indirectFunctionTable->getTableNumber(); uint32_t flags = 0; if (tableNumber) flags |= WASM_ELEM_SEGMENT_HAS_TABLE_NUMBER; @@ -587,7 +583,7 @@ void ElemSection::writeBody() { initExpr.Extended = false; if (ctx.isPic) { initExpr.Inst.Opcode = WASM_OPCODE_GLOBAL_GET; - initExpr.Inst.Value.Global = WasmSym::tableBase->getGlobalIndex(); + initExpr.Inst.Value.Global = ctx.sym.tableBase->getGlobalIndex(); } else { bool is64 = ctx.arg.is64.value_or(false); initExpr = intConst(ctx.arg.tableBase, is64); diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index 76e38f548157c..2bf4b370a7dbd 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -340,16 +340,16 @@ void Writer::layoutMemory() { if (ctx.arg.relocatable || ctx.isPic) return; memoryPtr = alignTo(memoryPtr, stackAlignment); - if (WasmSym::stackLow) - 
WasmSym::stackLow->setVA(memoryPtr); + if (ctx.sym.stackLow) + ctx.sym.stackLow->setVA(memoryPtr); if (ctx.arg.zStackSize != alignTo(ctx.arg.zStackSize, stackAlignment)) error("stack size must be " + Twine(stackAlignment) + "-byte aligned"); log("mem: stack size = " + Twine(ctx.arg.zStackSize)); log("mem: stack base = " + Twine(memoryPtr)); memoryPtr += ctx.arg.zStackSize; - setGlobalPtr(cast(WasmSym::stackPointer), memoryPtr); - if (WasmSym::stackHigh) - WasmSym::stackHigh->setVA(memoryPtr); + setGlobalPtr(cast(ctx.sym.stackPointer), memoryPtr); + if (ctx.sym.stackHigh) + ctx.sym.stackHigh->setVA(memoryPtr); log("mem: stack top = " + Twine(memoryPtr)); }; @@ -367,15 +367,15 @@ void Writer::layoutMemory() { } log("mem: global base = " + Twine(memoryPtr)); - if (WasmSym::globalBase) - WasmSym::globalBase->setVA(memoryPtr); + if (ctx.sym.globalBase) + ctx.sym.globalBase->setVA(memoryPtr); uint64_t dataStart = memoryPtr; // Arbitrarily set __dso_handle handle to point to the start of the data // segments. - if (WasmSym::dsoHandle) - WasmSym::dsoHandle->setVA(dataStart); + if (ctx.sym.dsoHandle) + ctx.sym.dsoHandle->setVA(dataStart); out.dylinkSec->memAlign = 0; for (OutputSegment *seg : segments) { @@ -386,16 +386,16 @@ void Writer::layoutMemory() { memoryPtr, seg->size, seg->alignment)); if (!ctx.arg.relocatable && seg->isTLS()) { - if (WasmSym::tlsSize) { - auto *tlsSize = cast(WasmSym::tlsSize); + if (ctx.sym.tlsSize) { + auto *tlsSize = cast(ctx.sym.tlsSize); setGlobalPtr(tlsSize, seg->size); } - if (WasmSym::tlsAlign) { - auto *tlsAlign = cast(WasmSym::tlsAlign); + if (ctx.sym.tlsAlign) { + auto *tlsAlign = cast(ctx.sym.tlsAlign); setGlobalPtr(tlsAlign, int64_t{1} << seg->alignment); } - if (!ctx.arg.sharedMemory && WasmSym::tlsBase) { - auto *tlsBase = cast(WasmSym::tlsBase); + if (!ctx.arg.sharedMemory && ctx.sym.tlsBase) { + auto *tlsBase = cast(ctx.sym.tlsBase); setGlobalPtr(tlsBase, memoryPtr); } } @@ -406,17 +406,17 @@ void Writer::layoutMemory() { // Make space for the memory initialization flag if (ctx.arg.sharedMemory && hasPassiveInitializedSegments()) { memoryPtr = alignTo(memoryPtr, 4); - WasmSym::initMemoryFlag = symtab->addSyntheticDataSymbol( + ctx.sym.initMemoryFlag = symtab->addSyntheticDataSymbol( "__wasm_init_memory_flag", WASM_SYMBOL_VISIBILITY_HIDDEN); - WasmSym::initMemoryFlag->markLive(); - WasmSym::initMemoryFlag->setVA(memoryPtr); + ctx.sym.initMemoryFlag->markLive(); + ctx.sym.initMemoryFlag->setVA(memoryPtr); log(formatv("mem: {0,-15} offset={1,-8} size={2,-8} align={3}", "__wasm_init_memory_flag", memoryPtr, 4, 4)); memoryPtr += 4; } - if (WasmSym::dataEnd) - WasmSym::dataEnd->setVA(memoryPtr); + if (ctx.sym.dataEnd) + ctx.sym.dataEnd->setVA(memoryPtr); uint64_t staticDataSize = memoryPtr - dataStart; log("mem: static data = " + Twine(staticDataSize)); @@ -426,7 +426,7 @@ void Writer::layoutMemory() { if (!ctx.arg.stackFirst) placeStack(); - if (WasmSym::heapBase) { + if (ctx.sym.heapBase) { // Set `__heap_base` to follow the end of the stack or global data. The // fact that this comes last means that a malloc/brk implementation can // grow the heap at runtime. @@ -434,7 +434,7 @@ void Writer::layoutMemory() { // __heap_base to be aligned already. 
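// Rough sketch of the resulting non-PIC layout when --stack-first is not used
// (derived from the code in this function; not an ABI guarantee):
//
//   __global_base .. data/bss segments .. __data_end ..
//   __stack_low .. __stack_high (with __stack_pointer starting at __stack_high) ..
//   __heap_base .. __heap_end (end of the linear memory wasm-ld allocates)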
memoryPtr = alignTo(memoryPtr, heapAlignment); log("mem: heap base = " + Twine(memoryPtr)); - WasmSym::heapBase->setVA(memoryPtr); + ctx.sym.heapBase->setVA(memoryPtr); } uint64_t maxMemorySetting = 1ULL << 32; @@ -470,12 +470,12 @@ void Writer::layoutMemory() { out.memorySec->numMemoryPages = memoryPtr / WasmPageSize; log("mem: total pages = " + Twine(out.memorySec->numMemoryPages)); - if (WasmSym::heapEnd) { + if (ctx.sym.heapEnd) { // Set `__heap_end` to follow the end of the statically allocated linear // memory. The fact that this comes last means that a malloc/brk // implementation can grow the heap at runtime. log("mem: heap end = " + Twine(memoryPtr)); - WasmSym::heapEnd->setVA(memoryPtr); + ctx.sym.heapEnd->setVA(memoryPtr); } uint64_t maxMemory = 0; @@ -758,14 +758,14 @@ void Writer::calculateImports() { // Some inputs require that the indirect function table be assigned to table // number 0, so if it is present and is an import, allocate it before any // other tables. - if (WasmSym::indirectFunctionTable && - shouldImport(WasmSym::indirectFunctionTable)) - out.importSec->addImport(WasmSym::indirectFunctionTable); + if (ctx.sym.indirectFunctionTable && + shouldImport(ctx.sym.indirectFunctionTable)) + out.importSec->addImport(ctx.sym.indirectFunctionTable); for (Symbol *sym : symtab->symbols()) { if (!shouldImport(sym)) continue; - if (sym == WasmSym::indirectFunctionTable) + if (sym == ctx.sym.indirectFunctionTable) continue; LLVM_DEBUG(dbgs() << "import: " << sym->getName() << "\n"); out.importSec->addImport(sym); @@ -879,7 +879,7 @@ void Writer::createCommandExportWrappers() { // If there are no ctors and there's no libc `__wasm_call_dtors` to // call, don't wrap the exports. - if (initFunctions.empty() && WasmSym::callDtors == nullptr) + if (initFunctions.empty() && ctx.sym.callDtors == nullptr) return; std::vector toWrap; @@ -919,27 +919,27 @@ void Writer::createCommandExportWrappers() { } static void finalizeIndirectFunctionTable() { - if (!WasmSym::indirectFunctionTable) + if (!ctx.sym.indirectFunctionTable) return; - if (shouldImport(WasmSym::indirectFunctionTable) && - !WasmSym::indirectFunctionTable->hasTableNumber()) { + if (shouldImport(ctx.sym.indirectFunctionTable) && + !ctx.sym.indirectFunctionTable->hasTableNumber()) { // Processing -Bsymbolic relocations resulted in a late requirement that the // indirect function table be present, and we are running in --import-table // mode. Add the table now to the imports section. Otherwise it will be // added to the tables section later in assignIndexes. - out.importSec->addImport(WasmSym::indirectFunctionTable); + out.importSec->addImport(ctx.sym.indirectFunctionTable); } uint32_t tableSize = ctx.arg.tableBase + out.elemSec->numEntries(); WasmLimits limits = {0, tableSize, 0}; - if (WasmSym::indirectFunctionTable->isDefined() && !ctx.arg.growableTable) { + if (ctx.sym.indirectFunctionTable->isDefined() && !ctx.arg.growableTable) { limits.Flags |= WASM_LIMITS_FLAG_HAS_MAX; limits.Maximum = limits.Minimum; } if (ctx.arg.is64.value_or(false)) limits.Flags |= WASM_LIMITS_FLAG_IS_64; - WasmSym::indirectFunctionTable->setLimits(limits); + ctx.sym.indirectFunctionTable->setLimits(limits); } static void scanRelocations() { @@ -1142,26 +1142,26 @@ void Writer::createSyntheticInitFunctions() { // We also initialize bss segments (using memory.fill) as part of this // function. 
if (hasPassiveInitializedSegments()) { - WasmSym::initMemory = symtab->addSyntheticFunction( + ctx.sym.initMemory = symtab->addSyntheticFunction( "__wasm_init_memory", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_init_memory")); - WasmSym::initMemory->markLive(); + ctx.sym.initMemory->markLive(); if (ctx.arg.sharedMemory) { // This global is assigned during __wasm_init_memory in the shared memory // case. - WasmSym::tlsBase->markLive(); + ctx.sym.tlsBase->markLive(); } } if (ctx.arg.sharedMemory) { if (out.globalSec->needsTLSRelocations()) { - WasmSym::applyGlobalTLSRelocs = symtab->addSyntheticFunction( + ctx.sym.applyGlobalTLSRelocs = symtab->addSyntheticFunction( "__wasm_apply_global_tls_relocs", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_apply_global_tls_relocs")); - WasmSym::applyGlobalTLSRelocs->markLive(); + ctx.sym.applyGlobalTLSRelocs->markLive(); // TLS relocations depend on the __tls_base symbols - WasmSym::tlsBase->markLive(); + ctx.sym.tlsBase->markLive(); } auto hasTLSRelocs = [](const OutputSegment *segment) { @@ -1172,40 +1172,39 @@ void Writer::createSyntheticInitFunctions() { return false; }; if (llvm::any_of(segments, hasTLSRelocs)) { - WasmSym::applyTLSRelocs = symtab->addSyntheticFunction( + ctx.sym.applyTLSRelocs = symtab->addSyntheticFunction( "__wasm_apply_tls_relocs", WASM_SYMBOL_VISIBILITY_HIDDEN, - make(nullSignature, - "__wasm_apply_tls_relocs")); - WasmSym::applyTLSRelocs->markLive(); + make(nullSignature, "__wasm_apply_tls_relocs")); + ctx.sym.applyTLSRelocs->markLive(); } } if (ctx.isPic && out.globalSec->needsRelocations()) { - WasmSym::applyGlobalRelocs = symtab->addSyntheticFunction( + ctx.sym.applyGlobalRelocs = symtab->addSyntheticFunction( "__wasm_apply_global_relocs", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_apply_global_relocs")); - WasmSym::applyGlobalRelocs->markLive(); + ctx.sym.applyGlobalRelocs->markLive(); } // If there is only one start function we can just use that function // itself as the Wasm start function, otherwise we need to synthesize // a new function to call them in sequence. - if (WasmSym::applyGlobalRelocs && WasmSym::initMemory) { - WasmSym::startFunction = symtab->addSyntheticFunction( + if (ctx.sym.applyGlobalRelocs && ctx.sym.initMemory) { + ctx.sym.startFunction = symtab->addSyntheticFunction( "__wasm_start", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_start")); - WasmSym::startFunction->markLive(); + ctx.sym.startFunction->markLive(); } } void Writer::createInitMemoryFunction() { LLVM_DEBUG(dbgs() << "createInitMemoryFunction\n"); - assert(WasmSym::initMemory); + assert(ctx.sym.initMemory); assert(hasPassiveInitializedSegments()); uint64_t flagAddress; if (ctx.arg.sharedMemory) { - assert(WasmSym::initMemoryFlag); - flagAddress = WasmSym::initMemoryFlag->getVA(); + assert(ctx.sym.initMemoryFlag); + flagAddress = ctx.sym.initMemoryFlag->getVA(); } bool is64 = ctx.arg.is64.value_or(false); std::string bodyContent; @@ -1278,7 +1277,7 @@ void Writer::createInitMemoryFunction() { writeUleb128(os, 2, "local count"); writeU8(os, is64 ? WASM_TYPE_I64 : WASM_TYPE_I32, "address type"); writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); - writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(), "memory_base"); + writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "memory_base"); writePtrConst(os, flagAddress, is64, "flag address"); writeU8(os, is64 ? 
WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD, "add"); writeU8(os, WASM_OPCODE_LOCAL_SET, "local.set"); @@ -1325,7 +1324,7 @@ void Writer::createInitMemoryFunction() { writePtrConst(os, s->startVA, is64, "destination address"); if (ctx.isPic) { writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); - writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(), + writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "__memory_base"); writeU8(os, is64 ? WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD, "i32.add"); @@ -1343,8 +1342,7 @@ void Writer::createInitMemoryFunction() { writePtrConst(os, s->startVA, is64, "destination address"); } writeU8(os, WASM_OPCODE_GLOBAL_SET, "GLOBAL_SET"); - writeUleb128(os, WasmSym::tlsBase->getGlobalIndex(), - "__tls_base"); + writeUleb128(os, ctx.sym.tlsBase->getGlobalIndex(), "__tls_base"); if (ctx.isPic) { writeU8(os, WASM_OPCODE_LOCAL_GET, "local.tee"); writeUleb128(os, 1, "local 1"); @@ -1420,30 +1418,30 @@ void Writer::createInitMemoryFunction() { writeU8(os, WASM_OPCODE_END, "END"); } - createFunction(WasmSym::initMemory, bodyContent); + createFunction(ctx.sym.initMemory, bodyContent); } void Writer::createStartFunction() { // If the start function exists when we have more than one function to call. - if (WasmSym::initMemory && WasmSym::applyGlobalRelocs) { - assert(WasmSym::startFunction); + if (ctx.sym.initMemory && ctx.sym.applyGlobalRelocs) { + assert(ctx.sym.startFunction); std::string bodyContent; { raw_string_ostream os(bodyContent); writeUleb128(os, 0, "num locals"); writeU8(os, WASM_OPCODE_CALL, "CALL"); - writeUleb128(os, WasmSym::applyGlobalRelocs->getFunctionIndex(), + writeUleb128(os, ctx.sym.applyGlobalRelocs->getFunctionIndex(), "function index"); writeU8(os, WASM_OPCODE_CALL, "CALL"); - writeUleb128(os, WasmSym::initMemory->getFunctionIndex(), + writeUleb128(os, ctx.sym.initMemory->getFunctionIndex(), "function index"); writeU8(os, WASM_OPCODE_END, "END"); } - createFunction(WasmSym::startFunction, bodyContent); - } else if (WasmSym::initMemory) { - WasmSym::startFunction = WasmSym::initMemory; - } else if (WasmSym::applyGlobalRelocs) { - WasmSym::startFunction = WasmSym::applyGlobalRelocs; + createFunction(ctx.sym.startFunction, bodyContent); + } else if (ctx.sym.initMemory) { + ctx.sym.startFunction = ctx.sym.initMemory; + } else if (ctx.sym.applyGlobalRelocs) { + ctx.sym.startFunction = ctx.sym.applyGlobalRelocs; } } @@ -1497,7 +1495,7 @@ void Writer::createApplyTLSRelocationsFunction() { writeU8(os, WASM_OPCODE_END, "END"); } - createFunction(WasmSym::applyTLSRelocs, bodyContent); + createFunction(ctx.sym.applyTLSRelocs, bodyContent); } // Similar to createApplyDataRelocationsFunction but generates relocation code @@ -1513,7 +1511,7 @@ void Writer::createApplyGlobalRelocationsFunction() { writeU8(os, WASM_OPCODE_END, "END"); } - createFunction(WasmSym::applyGlobalRelocs, bodyContent); + createFunction(ctx.sym.applyGlobalRelocs, bodyContent); } // Similar to createApplyGlobalRelocationsFunction but for @@ -1529,7 +1527,7 @@ void Writer::createApplyGlobalTLSRelocationsFunction() { writeU8(os, WASM_OPCODE_END, "END"); } - createFunction(WasmSym::applyGlobalTLSRelocs, bodyContent); + createFunction(ctx.sym.applyGlobalTLSRelocs, bodyContent); } // Create synthetic "__wasm_call_ctors" function based on ctor functions @@ -1537,7 +1535,7 @@ void Writer::createApplyGlobalTLSRelocationsFunction() { void Writer::createCallCtorsFunction() { // If __wasm_call_ctors isn't referenced, there aren't any ctors, don't // define the `__wasm_call_ctors` function. 
- if (!WasmSym::callCtors->isLive() && initFunctions.empty()) + if (!ctx.sym.callCtors->isLive() && initFunctions.empty()) return; // First write the body's contents to a string. @@ -1558,7 +1556,7 @@ void Writer::createCallCtorsFunction() { writeU8(os, WASM_OPCODE_END, "END"); } - createFunction(WasmSym::callCtors, bodyContent); + createFunction(ctx.sym.callCtors, bodyContent); } // Create a wrapper around a function export which calls the @@ -1573,10 +1571,9 @@ void Writer::createCommandExportWrapper(uint32_t functionIndex, // Call `__wasm_call_ctors` which call static constructors (and // applies any runtime relocations in Emscripten-style PIC mode) - if (WasmSym::callCtors->isLive()) { + if (ctx.sym.callCtors->isLive()) { writeU8(os, WASM_OPCODE_CALL, "CALL"); - writeUleb128(os, WasmSym::callCtors->getFunctionIndex(), - "function index"); + writeUleb128(os, ctx.sym.callCtors->getFunctionIndex(), "function index"); } // Call the user's code, leaving any return values on the operand stack. @@ -1588,7 +1585,7 @@ void Writer::createCommandExportWrapper(uint32_t functionIndex, writeUleb128(os, functionIndex, "function index"); // Call the function that calls the destructors. - if (DefinedFunction *callDtors = WasmSym::callDtors) { + if (DefinedFunction *callDtors = ctx.sym.callDtors) { writeU8(os, WASM_OPCODE_CALL, "CALL"); writeUleb128(os, callDtors->getFunctionIndex(), "function index"); } @@ -1619,7 +1616,7 @@ void Writer::createInitTLSFunction() { writeUleb128(os, 0, "local index"); writeU8(os, WASM_OPCODE_GLOBAL_SET, "global.set"); - writeUleb128(os, WasmSym::tlsBase->getGlobalIndex(), "global index"); + writeUleb128(os, ctx.sym.tlsBase->getGlobalIndex(), "global index"); // FIXME(wvo): this local needs to be I64 in wasm64, or we need an extend op. writeU8(os, WASM_OPCODE_LOCAL_GET, "local.get"); @@ -1635,28 +1632,28 @@ void Writer::createInitTLSFunction() { writeU8(os, 0, "memory index immediate"); } - if (WasmSym::applyTLSRelocs) { + if (ctx.sym.applyTLSRelocs) { writeU8(os, WASM_OPCODE_CALL, "CALL"); - writeUleb128(os, WasmSym::applyTLSRelocs->getFunctionIndex(), + writeUleb128(os, ctx.sym.applyTLSRelocs->getFunctionIndex(), "function index"); } - if (WasmSym::applyGlobalTLSRelocs) { + if (ctx.sym.applyGlobalTLSRelocs) { writeU8(os, WASM_OPCODE_CALL, "CALL"); - writeUleb128(os, WasmSym::applyGlobalTLSRelocs->getFunctionIndex(), + writeUleb128(os, ctx.sym.applyGlobalTLSRelocs->getFunctionIndex(), "function index"); } writeU8(os, WASM_OPCODE_END, "end function"); } - createFunction(WasmSym::initTLS, bodyContent); + createFunction(ctx.sym.initTLS, bodyContent); } // Populate InitFunctions vector with init functions from all input objects. // This is then used either when creating the output linking section or to // synthesize the "__wasm_call_ctors" function. void Writer::calculateInitFunctions() { - if (!ctx.arg.relocatable && !WasmSym::callCtors->isLive()) + if (!ctx.arg.relocatable && !ctx.sym.callCtors->isLive()) return; for (ObjFile *file : ctx.objectFiles) { @@ -1707,8 +1704,8 @@ void Writer::createSyntheticSectionsPostLayout() { void Writer::run() { // For PIC code the table base is assigned dynamically by the loader. // For non-PIC, we start at 1 so that accessing table index 0 always traps. 
- if (!ctx.isPic && WasmSym::definedTableBase) - WasmSym::definedTableBase->setVA(ctx.arg.tableBase); + if (!ctx.isPic && ctx.sym.definedTableBase) + ctx.sym.definedTableBase->setVA(ctx.arg.tableBase); log("-- createOutputSegments"); createOutputSegments(); @@ -1776,14 +1773,18 @@ void Writer::run() { if (!ctx.arg.relocatable) { // Create linker synthesized functions - if (WasmSym::applyGlobalRelocs) + if (ctx.sym.applyGlobalRelocs) { createApplyGlobalRelocationsFunction(); - if (WasmSym::applyTLSRelocs) + } + if (ctx.sym.applyTLSRelocs) { createApplyTLSRelocationsFunction(); - if (WasmSym::applyGlobalTLSRelocs) + } + if (ctx.sym.applyGlobalTLSRelocs) { createApplyGlobalTLSRelocationsFunction(); - if (WasmSym::initMemory) + } + if (ctx.sym.initMemory) { createInitMemoryFunction(); + } createStartFunction(); createCallCtorsFunction(); @@ -1794,14 +1795,14 @@ void Writer::run() { // the input objects or an explicit export from the command-line, we // assume ctors and dtors are taken care of already. if (!ctx.arg.relocatable && !ctx.isPic && - !WasmSym::callCtors->isUsedInRegularObj && - !WasmSym::callCtors->isExported()) { + !ctx.sym.callCtors->isUsedInRegularObj && + !ctx.sym.callCtors->isExported()) { log("-- createCommandExportWrappers"); createCommandExportWrappers(); } } - if (WasmSym::initTLS && WasmSym::initTLS->isLive()) { + if (ctx.sym.initTLS && ctx.sym.initTLS->isLive()) { log("-- createInitTLSFunction"); createInitTLSFunction(); } diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index f34003eaf0fe2..ef4ec9b56f364 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -349,6 +349,15 @@ Changes to the RISC-V Backend extension. * Added ``Sdext`` and ``Sdtrig`` extensions. +Changes to the SystemZ Backend +------------------------------ + +* Added support for the IBM z17 processor and the arch15 cpu architecture. +* Added support for `__builtin_setjump` and `__builtin_longjmp`. +* Improve inlining heuristics to fix compile time explosion in certain cases. +* Improve various cost functions. +* Improve compatibility of the assembler parser with the GNU assembler. + Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index b2a3f3390e000..06e8eb7072917 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1245,8 +1245,11 @@ AliasResult BasicAAResult::aliasGEP( if (V1Size.isScalable() || V2Size.isScalable()) return AliasResult::MayAlias; - // We need to know both acess sizes for all the following heuristics. - if (!V1Size.hasValue() || !V2Size.hasValue()) + // We need to know both access sizes for all the following heuristics. Don't + // try to reason about sizes larger than the index space. + unsigned BW = DecompGEP1.Offset.getBitWidth(); + if (!V1Size.hasValue() || !V2Size.hasValue() || + !isUIntN(BW, V1Size.getValue()) || !isUIntN(BW, V2Size.getValue())) return AliasResult::MayAlias; APInt GCD; @@ -1301,7 +1304,6 @@ AliasResult BasicAAResult::aliasGEP( // Compute ranges of potentially accessed bytes for both accesses. If the // interseciton is empty, there can be no overlap. 
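// Worked example for the new size guard above (illustrative numbers): with a
// 32-bit pointer index width, BW == 32, so an access of size 1ULL << 32 fails
// isUIntN(32, Size) and the query conservatively returns MayAlias rather than
// feeding a value that does not fit the index space into the APInt math below.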
- unsigned BW = OffsetRange.getBitWidth(); ConstantRange Range1 = OffsetRange.add( ConstantRange(APInt(BW, 0), APInt(BW, V1Size.getValue()))); ConstantRange Range2 = diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index e77abf429e6b4..c8f567e5f4195 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2139,16 +2139,20 @@ void AsmPrinter::emitFunctionBody() { } /// Compute the number of Global Variables that uses a Constant. -static unsigned getNumGlobalVariableUses(const Constant *C) { - if (!C) +static unsigned getNumGlobalVariableUses(const Constant *C, + bool &HasNonGlobalUsers) { + if (!C) { + HasNonGlobalUsers = true; return 0; + } if (isa(C)) return 1; unsigned NumUses = 0; for (const auto *CU : C->users()) - NumUses += getNumGlobalVariableUses(dyn_cast(CU)); + NumUses += + getNumGlobalVariableUses(dyn_cast(CU), HasNonGlobalUsers); return NumUses; } @@ -2159,7 +2163,8 @@ static unsigned getNumGlobalVariableUses(const Constant *C) { /// candidates are skipped and are emitted later in case at least one cstexpr /// isn't replaced by a PC relative GOT entry access. static bool isGOTEquivalentCandidate(const GlobalVariable *GV, - unsigned &NumGOTEquivUsers) { + unsigned &NumGOTEquivUsers, + bool &HasNonGlobalUsers) { // Global GOT equivalents are unnamed private globals with a constant // pointer initializer to another global symbol. They must point to a // GlobalVariable or Function, i.e., as GlobalValue. @@ -2171,7 +2176,8 @@ static bool isGOTEquivalentCandidate(const GlobalVariable *GV, // To be a got equivalent, at least one of its users need to be a constant // expression used by another global variable. for (const auto *U : GV->users()) - NumGOTEquivUsers += getNumGlobalVariableUses(dyn_cast(U)); + NumGOTEquivUsers += + getNumGlobalVariableUses(dyn_cast(U), HasNonGlobalUsers); return NumGOTEquivUsers > 0; } @@ -2189,9 +2195,13 @@ void AsmPrinter::computeGlobalGOTEquivs(Module &M) { for (const auto &G : M.globals()) { unsigned NumGOTEquivUsers = 0; - if (!isGOTEquivalentCandidate(&G, NumGOTEquivUsers)) + bool HasNonGlobalUsers = false; + if (!isGOTEquivalentCandidate(&G, NumGOTEquivUsers, HasNonGlobalUsers)) continue; - + // If non-global variables use it, we still need to emit it. + // Add 1 here, then emit it in `emitGlobalGOTEquivs`. + if (HasNonGlobalUsers) + NumGOTEquivUsers += 1; const MCSymbol *GOTEquivSym = getSymbol(&G); GlobalGOTEquivs[GOTEquivSym] = std::make_pair(&G, NumGOTEquivUsers); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b416c0efbbc4f..eecfb41c2d319 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2660,16 +2660,19 @@ bool SelectionDAG::expandMultipleResultFPLibCall( continue; } MachinePointerInfo PtrInfo; + SDValue LoadResult = + getLoad(Node->getValueType(ResNo), DL, CallChain, ResultPtr, PtrInfo); + SDValue OutChain = LoadResult.getValue(1); + if (StoreSDNode *ST = ResultStores[ResNo]) { // Replace store with the library call. 
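// Note on the reordering: the load from ResultPtr is now created before the
// store is replaced, so the store's chain users are redirected to the load's
// output chain (OutChain) rather than to CallChain. Anything previously ordered
// after the store therefore stays ordered after the load of the call's result.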
- ReplaceAllUsesOfValueWith(SDValue(ST, 0), CallChain); + ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); PtrInfo = ST->getPointerInfo(); } else { PtrInfo = MachinePointerInfo::getFixedStack( getMachineFunction(), cast(ResultPtr)->getIndex()); } - SDValue LoadResult = - getLoad(Node->getValueType(ResNo), DL, CallChain, ResultPtr, PtrInfo); + Results.push_back(LoadResult); } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9c56912aa6ba0..411f59e714b0e 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1987,6 +1987,9 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const { // Currently only support "standard" __stack_chk_guard. // TODO: add LOAD_STACK_GUARD support. Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const { + if (getTargetMachine().getTargetTriple().isOSOpenBSD()) { + return M.getNamedValue("__guard_local"); + } return M.getNamedValue("__stack_chk_guard"); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 8432779c107de..551c00a518b8f 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2818,6 +2818,9 @@ void Verifier::visitFunction(const Function &F) { Check(!Attrs.hasAttrSomewhere(Attribute::ElementType), "Attribute 'elementtype' can only be applied to a callsite.", &F); + Check(!Attrs.hasFnAttr("aarch64_zt0_undef"), + "Attribute 'aarch64_zt0_undef' can only be applied to a callsite."); + if (Attrs.hasFnAttr(Attribute::Naked)) for (const Argument &Arg : F.args()) Check(Arg.use_empty(), "cannot use argument of naked function", &Arg); diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp index 8ecd669e67178..93bc6631e64c8 100644 --- a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp @@ -116,10 +116,6 @@ uint64_t MachOLayoutBuilder::layoutSegments() { const bool IsObjectFile = O.Header.FileType == MachO::HeaderFileType::MH_OBJECT; uint64_t Offset = IsObjectFile ? (HeaderSize + O.Header.SizeOfCmds) : 0; - // If we are emitting an encryptable binary, our load commands must have a - // separate (non-encrypted) page to themselves. 
- bool RequiresFirstSectionOutsideFirstPage = - O.EncryptionInfoCommandIndex.has_value(); for (LoadCommand &LC : O.LoadCommands) { auto &MLC = LC.MachOLoadCommand; StringRef Segname; @@ -173,10 +169,6 @@ uint64_t MachOLayoutBuilder::layoutSegments() { if (!Sec->hasValidOffset()) { Sec->Offset = 0; } else { - if (RequiresFirstSectionOutsideFirstPage) { - SectOffset = alignToPowerOf2(SectOffset, PageSize); - RequiresFirstSectionOutsideFirstPage = false; - } Sec->Offset = SegOffset + SectOffset; Sec->Size = Sec->Content.size(); SegFileSize = std::max(SegFileSize, SectOffset + Sec->Size); diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp index e0819d89d24ff..8d2c02dc37c99 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp @@ -98,10 +98,6 @@ void Object::updateLoadCommandIndexes() { case MachO::LC_DYLD_EXPORTS_TRIE: ExportsTrieCommandIndex = Index; break; - case MachO::LC_ENCRYPTION_INFO: - case MachO::LC_ENCRYPTION_INFO_64: - EncryptionInfoCommandIndex = Index; - break; } } } diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h index 79eb0133c2802..a454c4f502fd6 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.h +++ b/llvm/lib/ObjCopy/MachO/MachOObject.h @@ -341,9 +341,6 @@ struct Object { /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command /// corresponding to the __TEXT segment. std::optional TextSegmentCommandIndex; - /// The index of the LC_ENCRYPTION_INFO or LC_ENCRYPTION_INFO_64 load command - /// if present. - std::optional EncryptionInfoCommandIndex; BumpPtrAllocator Alloc; StringSaver NewSectionsContents; diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp index ef0e0262f9395..2b344f36d8e78 100644 --- a/llvm/lib/ObjCopy/MachO/MachOReader.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOReader.cpp @@ -184,10 +184,6 @@ Error MachOReader::readLoadCommands(Object &O) const { case MachO::LC_DYLD_CHAINED_FIXUPS: O.ChainedFixupsCommandIndex = O.LoadCommands.size(); break; - case MachO::LC_ENCRYPTION_INFO: - case MachO::LC_ENCRYPTION_INFO_64: - O.EncryptionInfoCommandIndex = O.LoadCommands.size(); - break; } #define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ case MachO::LCName: \ diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index 69d36e6a77db7..5db264207ffb7 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -192,7 +192,8 @@ static Expected getLoadCommandInfo(const MachOObjectFile &Obj, const char *Ptr, uint32_t LoadCommandIndex) { if (auto CmdOrErr = getStructOrErr(Obj, Ptr)) { - if (CmdOrErr->cmdsize + Ptr > Obj.getData().end()) + assert(Ptr <= Obj.getData().end() && "Start must be before end"); + if (CmdOrErr->cmdsize > (uintptr_t)(Obj.getData().end() - Ptr)) return malformedError("load command " + Twine(LoadCommandIndex) + " extends past end of file"); if (CmdOrErr->cmdsize < 8) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index d3abd79b85a75..ce947951b26ef 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2501,20 +2501,33 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Deallocate the SVE area. if (SVEStackSize) { - // If we have stack realignment or variable sized objects on the stack, - // restore the stack pointer from the frame pointer prior to SVE CSR - // restoration. 
- if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) { - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - // Set SP to start of SVE callee-save area from which they can - // be reloaded. The code below will deallocate the stack space - // space by moving FP -> SP. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP, - StackOffset::getScalable(-CalleeSavedSize), TII, + int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); + // If we have stack realignment or variable-sized objects, we must use the + // FP to restore SVE callee saves (as there is an unknown amount of + // data/padding between the SP and SVE CS area). + Register BaseForSVEDealloc = + (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP + : AArch64::SP; + if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { + Register CalleeSaveBase = AArch64::FP; + if (int64_t CalleeSaveBaseOffset = + AFI->getCalleeSaveBaseToFrameRecordOffset()) { + // If we have a non-zero offset to the non-SVE CS base we need to + // compute the base address by subtracting the offset in a temporary + // register first (to avoid briefly deallocating the SVE CS). + CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, + StackOffset::getFixed(-CalleeSaveBaseOffset), TII, MachineInstr::FrameDestroy); } - } else { - if (AFI->getSVECalleeSavedStackSize()) { + // The code below will deallocate the stack space by moving the + // SP to the start of the SVE callee-save area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, + StackOffset::getScalable(-SVECalleeSavedSize), TII, + MachineInstr::FrameDestroy); + } else if (BaseForSVEDealloc == AArch64::SP) { + if (SVECalleeSavedSize) { // Deallocate the non-SVE locals first before we can deallocate (and // restore callee saves) from the SVE area. emitFrameOffset( @@ -3792,6 +3805,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, CSStackSize += SpillSize; } + // Save number of saved regs, so we can easily update CSStackSize later to + // account for any additional 64-bit GPR saves. Note: After this point + // only 64-bit GPRs can be added to SavedRegs. + unsigned NumSavedRegs = SavedRegs.count(); + // Increase the callee-saved stack size if the function has streaming mode // changes, as we will need to spill the value of the VG register. // For locally streaming functions, we spill both the streaming and @@ -3811,8 +3829,9 @@ if (AFI->hasStackHazardSlotIndex()) CSStackSize += getStackHazardSize(MF); - // Save number of saved regs, so we can easily update CSStackSize later. - unsigned NumSavedRegs = SavedRegs.count(); + // If we must call __arm_get_current_vg in the prologue, preserve the LR.
+ if (requiresSaveVG(MF) && !Subtarget.hasSVE()) + SavedRegs.set(AArch64::LR); // The frame record needs to be created by saving the appropriate registers uint64_t EstimatedStackSize = MFI.estimateStackSize(MF); diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 1387a224fa660..0aad7665f6216 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -4608,9 +4608,31 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) { if (ShAmt + HsAmt != 64) return false; + // If the input is a v1i64, widen to a v2i64 to use XAR. + assert((VT == MVT::v1i64 || VT == MVT::v2i64) && "Unexpected XAR type!"); + if (VT == MVT::v1i64) { + EVT SVT = MVT::v2i64; + SDValue Undef = + SDValue(CurDAG->getMachineNode(AArch64::IMPLICIT_DEF, DL, SVT), 0); + SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, Undef, + R1, DSub), + 0); + if (R2.getValueType() == MVT::v1i64) + R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, + Undef, R2, DSub), + 0); + } + SDValue Ops[] = {R1, R2, Imm}; - CurDAG->SelectNodeTo(N, AArch64::XAR, N0.getValueType(), Ops); + SDNode *XAR = CurDAG->getMachineNode(AArch64::XAR, DL, MVT::v2i64, Ops); + if (VT == MVT::v1i64) { + SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT, + SDValue(XAR, 0), DSub); + } + ReplaceNode(N, XAR); return true; } diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp index c3bc70ad6f427..aba84574c7526 100644 --- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp +++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp @@ -152,12 +152,15 @@ void AArch64PointerAuth::signLR(MachineFunction &MF, ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel()); } else { BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup); - emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup, EmitCFI); + if (MFnI.branchProtectionPAuthLR()) + emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup, EmitCFI); BuildMI(MBB, MBBI, DL, TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSP : AArch64::PACIASP)) .setMIFlag(MachineInstr::FrameSetup) ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel()); + if (!MFnI.branchProtectionPAuthLR()) + emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup, EmitCFI); } if (!EmitCFI && NeedsWinCFI) { @@ -220,11 +223,15 @@ void AArch64PointerAuth::authenticateLR( .setMIFlag(MachineInstr::FrameDestroy); } else { BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy, PACSym); - emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy, - EmitAsyncCFI); + if (MFnI->branchProtectionPAuthLR()) + emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy, + EmitAsyncCFI); BuildMI(MBB, MBBI, DL, TII->get(UseBKey ? 
AArch64::AUTIBSP : AArch64::AUTIASP)) .setMIFlag(MachineInstr::FrameDestroy); + if (!MFnI->branchProtectionPAuthLR()) + emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy, + EmitAsyncCFI); } if (NeedsWinCFI) { diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index d1d4986d12550..80454b4f72d05 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -868,7 +868,8 @@ def ProcessorFeatures { FeatureSSBS, FeatureLS64, FeatureCLRBHB, FeatureSPECRES2, FeatureSVEAES, FeatureSVE2SM4, FeatureSVE2SHA3, FeatureSVE2, FeatureSVEBitPerm, FeatureETE, - FeatureMEC, FeatureFAMINMAX, FeatureFP8DOT2, FeatureLUT]; + FeatureMEC, FeatureFAMINMAX, FeatureFP8DOT2, FeatureFP8DOT4, + FeatureFP8FMA, FeatureLUT]; list Carmel = [HasV8_2aOps, FeatureNEON, FeatureSHA2, FeatureAES, FeatureFullFP16, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM, FeatureFPARMv8]; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index e9730348ba58e..367f6b626b420 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -18,6 +18,7 @@ #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64InstPrinter.h" +#include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/BitVector.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -615,14 +616,27 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { return true; auto &ST = MF.getSubtarget(); + const AArch64FunctionInfo *AFI = MF.getInfo(); if (ST.hasSVE() || ST.isStreaming()) { - const AArch64FunctionInfo *AFI = MF.getInfo(); // Frames that have variable sized objects and scalable SVE objects, // should always use a basepointer. if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE()) return true; } + // Frames with hazard padding can have a large offset between the frame + // pointer and GPR locals, which includes the emergency spill slot. If the + // emergency spill slot is not within range of the load/store instructions + // (which have a signed 9-bit range), we will fail to compile if it is used. + // Since hasBasePointer() is called before we know if we have hazard padding + // or an emergency spill slot we need to enable the basepointer + // conservatively. + if (AFI->hasStackHazardSlotIndex() || + (ST.getStreamingHazardSize() && + !SMEAttrs(MF.getFunction()).hasNonStreamingInterfaceAndBody())) { + return true; + } + // Conservatively estimate whether the negative offset from the frame // pointer will be sufficient to reach. 
If a function has a smallish // frame, it's less likely to have lots of spills and callee saved @@ -747,7 +761,8 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { assert((!MF.getSubtarget().hasSVE() || AFI->hasCalculatedStackSizeSVE()) && "Expected SVE area to be calculated by this point"); - return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE(); + return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE() && + !AFI->hasStackHazardSlotIndex(); } bool AArch64RegisterInfo::requiresFrameIndexScavenging( diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index bb885d86392fe..b6685497e1fd1 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -54,14 +54,22 @@ FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); } //===----------------------------------------------------------------------===// // Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0. -void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { +void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { + auto &Ctx = M->getContext(); auto *TPIDR2SaveTy = FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false); - auto Attrs = AttributeList().addFnAttribute(M->getContext(), - "aarch64_pstate_sm_compatible"); + auto Attrs = + AttributeList().addFnAttribute(Ctx, "aarch64_pstate_sm_compatible"); FunctionCallee Callee = M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs); CallInst *Call = Builder.CreateCall(Callee); + + // If ZT0 is undefined (i.e. we're at the entry of a "new_zt0" function), mark + // that on the __arm_tpidr2_save call. This prevents an unnecessary spill of + // ZT0 that can occur before ZA is enabled. + if (ZT0IsUndef) + Call->addFnAttr(Attribute::get(Ctx, "aarch64_zt0_undef")); + Call->setCallingConv( CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0); @@ -119,7 +127,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Create a call __arm_tpidr2_save, which commits the lazy save. Builder.SetInsertPoint(&SaveBB->back()); - emitTPIDR2Save(M, Builder); + emitTPIDR2Save(M, Builder, /*ZT0IsUndef=*/FnAttrs.isNewZT0()); // Enable pstate.za at the start of the function. 
Builder.SetInsertPoint(&OrigBB->front()); diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index bf16acd7f8f7e..76d2ac6a601e5 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -75,6 +75,8 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask |= SM_Body; if (Attrs.hasFnAttr("aarch64_za_state_agnostic")) Bitmask |= ZA_State_Agnostic; + if (Attrs.hasFnAttr("aarch64_zt0_undef")) + Bitmask |= ZT0_Undef; if (Attrs.hasFnAttr("aarch64_in_za")) Bitmask |= encodeZAState(StateValue::In); if (Attrs.hasFnAttr("aarch64_out_za")) diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index fb093da70c46b..1691d4fec8b68 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -43,9 +43,10 @@ class SMEAttrs { SM_Body = 1 << 2, // aarch64_pstate_sm_body SME_ABI_Routine = 1 << 3, // Used for SME ABI routines to avoid lazy saves ZA_State_Agnostic = 1 << 4, - ZA_Shift = 5, + ZT0_Undef = 1 << 5, // Use to mark ZT0 as undef to avoid spills + ZA_Shift = 6, ZA_Mask = 0b111 << ZA_Shift, - ZT0_Shift = 8, + ZT0_Shift = 9, ZT0_Mask = 0b111 << ZT0_Shift }; @@ -125,6 +126,7 @@ class SMEAttrs { bool isPreservesZT0() const { return decodeZT0State(Bitmask) == StateValue::Preserved; } + bool isUndefZT0() const { return Bitmask & ZT0_Undef; } bool sharesZT0() const { StateValue State = decodeZT0State(Bitmask); return State == StateValue::In || State == StateValue::Out || @@ -132,7 +134,7 @@ class SMEAttrs { } bool hasZT0State() const { return isNewZT0() || sharesZT0(); } bool requiresPreservingZT0(const SMEAttrs &Callee) const { - return hasZT0State() && !Callee.sharesZT0() && + return hasZT0State() && !Callee.isUndefZT0() && !Callee.sharesZT0() && !Callee.hasAgnosticZAInterface(); } bool requiresDisablingZABeforeCall(const SMEAttrs &Callee) const { diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 244f204539c89..acf701b0f3e5d 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -109,7 +109,12 @@ def pfalse: PatFrag<(ops), (HexagonPFALSE)>; def pnot: PatFrag<(ops node:$Pu), (xor node:$Pu, ptrue)>; def: Pat<(v8i1 (HexagonPFALSE)), (C2_tfrrp (A2_tfrsi (i32 0)))>; +def: Pat<(v4i1 (HexagonPFALSE)), (C2_tfrrp (A2_tfrsi (i32 0)))>; +def: Pat<(v2i1 (HexagonPFALSE)), (C2_tfrrp (A2_tfrsi (i32 0)))>; + def: Pat<(v8i1 (HexagonPTRUE)), (C2_tfrrp (A2_tfrsi (i32 -1)))>; +def: Pat<(v4i1 (HexagonPTRUE)), (C2_tfrrp (A2_tfrsi (i32 -1)))>; +def: Pat<(v2i1 (HexagonPTRUE)), (C2_tfrrp (A2_tfrsi (i32 -1)))>; def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru), (HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>; diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 420b98b8a9c1f..f31d85305bbbe 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -1663,6 +1663,9 @@ LoongArchAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, return Match_Success; } + if (Kind == MCK_GPRNoR0R1 && (Reg == LoongArch::R0 || Reg == LoongArch::R1)) + return Match_RequiresOpnd2NotR0R1; + return Match_InvalidOperand; } diff --git 
a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp index 5963208691f72..761682423fffe 100644 --- a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp +++ b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp @@ -62,6 +62,14 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo, return MCDisassembler::Success; } +static DecodeStatus +DecodeGPRNoR0R1RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo <= 1) + return MCDisassembler::Fail; + return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); +} + static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { diff --git a/llvm/lib/Target/LoongArch/LoongArch.h b/llvm/lib/Target/LoongArch/LoongArch.h index db60523738880..6635c57ff0476 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.h +++ b/llvm/lib/Target/LoongArch/LoongArch.h @@ -35,7 +35,8 @@ bool lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, FunctionPass *createLoongArchDeadRegisterDefinitionsPass(); FunctionPass *createLoongArchExpandAtomicPseudoPass(); -FunctionPass *createLoongArchISelDag(LoongArchTargetMachine &TM); +FunctionPass *createLoongArchISelDag(LoongArchTargetMachine &TM, + CodeGenOptLevel OptLevel); FunctionPass *createLoongArchMergeBaseOffsetOptPass(); FunctionPass *createLoongArchOptWInstrsPass(); FunctionPass *createLoongArchPreRAExpandPseudoPass(); diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index 895a8e2646692..9a383f0a79a5c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -265,13 +265,16 @@ void LoongArchAsmPrinter::emitJumpTableInfo() { assert(TM.getTargetTriple().isOSBinFormatELF()); - unsigned Size = getDataLayout().getPointerSize(); auto *LAFI = MF->getInfo(); unsigned EntrySize = LAFI->getJumpInfoSize(); + auto JTI = MF->getJumpTableInfo(); - if (0 == EntrySize) + if (!JTI || 0 == EntrySize) return; + unsigned Size = getDataLayout().getPointerSize(); + auto JT = JTI->getJumpTables(); + // Emit an additional section to store the correlation info as pairs of // addresses, each pair contains the address of a jump instruction (jr) and // the address of the jump table. 
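A minimal sketch of the guard pattern introduced in emitJumpTableInfo(): bail out above when the function has no MachineJumpTableInfo, and skip entries whose jump table has been emptied before emitting each address pair in the hunk below. This assumes the surrounding LoongArchAsmPrinter::emitJumpTableInfo() locals (MF, LAFI); the JTs name is illustrative, not taken from the patch.

// Sketch: skip annotation entirely without jump-table info, and skip
// individual entries whose jump table has been emptied by optimizations.
const MachineJumpTableInfo *JTI = MF->getJumpTableInfo();
unsigned EntrySize = LAFI->getJumpInfoSize();
if (!JTI || EntrySize == 0)
  return;
const auto &JTs = JTI->getJumpTables();
for (unsigned Idx = 0; Idx < EntrySize; ++Idx) {
  int JTIIdx = LAFI->getJumpInfoJTIIndex(Idx);
  if (JTs[JTIIdx].MBBs.empty())
    continue; // Nothing to correlate for a dead jump table.
  // ... emit the (jr label, jump-table symbol) address pair as in the hunk below ...
}
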
@@ -279,14 +282,15 @@ void LoongArchAsmPrinter::emitJumpTableInfo() { ".discard.tablejump_annotate", ELF::SHT_PROGBITS, 0)); for (unsigned Idx = 0; Idx < EntrySize; ++Idx) { + int JTIIdx = LAFI->getJumpInfoJTIIndex(Idx); + if (JT[JTIIdx].MBBs.empty()) + continue; OutStreamer->emitValue( MCSymbolRefExpr::create(LAFI->getJumpInfoJrMI(Idx)->getPreInstrSymbol(), OutContext), Size); OutStreamer->emitValue( - MCSymbolRefExpr::create( - GetJTISymbol(LAFI->getJumpInfoJTIMO(Idx)->getIndex()), OutContext), - Size); + MCSymbolRefExpr::create(GetJTISymbol(JTIIdx), OutContext), Size); } } diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index c2d73a260b1c1..2107908be34ca 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -638,7 +638,8 @@ void LoongArchPreRAExpandPseudo::annotateTableJump( if (MO.isJTI()) { MBBI->setPreInstrSymbol( *MF, MF->getContext().createNamedTempSymbol("jrtb_")); - MF->getInfo()->setJumpInfo(&*MBBI, &MO); + MF->getInfo()->setJumpInfo( + &*MBBI, MO.getIndex()); IsFound = true; return; } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp index cb0fb9bc9c7f9..7169cdc9a2bf9 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp @@ -25,8 +25,9 @@ using namespace llvm; char LoongArchDAGToDAGISelLegacy::ID; LoongArchDAGToDAGISelLegacy::LoongArchDAGToDAGISelLegacy( - LoongArchTargetMachine &TM) - : SelectionDAGISelLegacy(ID, std::make_unique(TM)) {} + LoongArchTargetMachine &TM, CodeGenOptLevel OptLevel) + : SelectionDAGISelLegacy( + ID, std::make_unique(TM, OptLevel)) {} INITIALIZE_PASS(LoongArchDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) @@ -456,6 +457,7 @@ bool LoongArchDAGToDAGISel::selectVSplatUimmPow2(SDValue N, // This pass converts a legalized DAG into a LoongArch-specific DAG, ready // for instruction scheduling. 
-FunctionPass *llvm::createLoongArchISelDag(LoongArchTargetMachine &TM) { - return new LoongArchDAGToDAGISelLegacy(TM); +FunctionPass *llvm::createLoongArchISelDag(LoongArchTargetMachine &TM, + CodeGenOptLevel OptLevel) { + return new LoongArchDAGToDAGISelLegacy(TM, OptLevel); } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h index 8a7eba418d804..2e6bc9951e9e7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h @@ -26,8 +26,9 @@ class LoongArchDAGToDAGISel : public SelectionDAGISel { public: LoongArchDAGToDAGISel() = delete; - explicit LoongArchDAGToDAGISel(LoongArchTargetMachine &TM) - : SelectionDAGISel(TM) {} + explicit LoongArchDAGToDAGISel(LoongArchTargetMachine &TM, + CodeGenOptLevel OptLevel) + : SelectionDAGISel(TM, OptLevel) {} bool runOnMachineFunction(MachineFunction &MF) override { Subtarget = &MF.getSubtarget(); @@ -71,7 +72,8 @@ class LoongArchDAGToDAGISel : public SelectionDAGISel { class LoongArchDAGToDAGISelLegacy : public SelectionDAGISelLegacy { public: static char ID; - explicit LoongArchDAGToDAGISelLegacy(LoongArchTargetMachine &TM); + explicit LoongArchDAGToDAGISelLegacy(LoongArchTargetMachine &TM, + CodeGenOptLevel OptLevel); }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 4ed3c3cf92e3e..98b7a1126e560 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -1209,7 +1209,7 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef Mask, if (*it < 0) // UNDEF MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); else if ((*it >= 0 && *it < HalfSize) || - (*it >= MaskSize && *it <= MaskSize + HalfSize)) { + (*it >= MaskSize && *it < MaskSize + HalfSize)) { int M = *it < HalfSize ? 
*it : *it - HalfSize; MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64)); } else diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 9b93a9f824726..00e8548071182 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -2351,7 +2351,7 @@ let hasSideEffects = 1, Constraints = "$rd = $dst" in { def CSRWR : FmtCSR<0x04000020, (outs GPR:$dst), (ins GPR:$rd, uimm14:$csr_num), "$rd, $csr_num">; def CSRXCHG : FmtCSRXCHG<0x04000000, (outs GPR:$dst), - (ins GPR:$rd, GPR:$rj, uimm14:$csr_num), + (ins GPR:$rd, GPRNoR0R1:$rj, uimm14:$csr_num), "$rd, $rj, $csr_num">; } // hasSideEffects = 1, Constraints = "$rd = $dst" @@ -2398,8 +2398,8 @@ def IDLE : MISC_I15<0x06488000>; def : Pat<(loongarch_csrrd uimm14:$imm14), (CSRRD uimm14:$imm14)>; def : Pat<(loongarch_csrwr GPR:$rd, uimm14:$imm14), (CSRWR GPR:$rd, uimm14:$imm14)>; -def : Pat<(loongarch_csrxchg GPR:$rd, GPR:$rj, uimm14:$imm14), - (CSRXCHG GPR:$rd, GPR:$rj, uimm14:$imm14)>; +def : Pat<(loongarch_csrxchg GPR:$rd, GPRNoR0R1:$rj, uimm14:$imm14), + (CSRXCHG GPR:$rd, GPRNoR0R1:$rj, uimm14:$imm14)>; def : Pat<(loongarch_iocsrrd_b GPR:$rj), (IOCSRRD_B GPR:$rj)>; def : Pat<(loongarch_iocsrrd_h GPR:$rj), (IOCSRRD_H GPR:$rj)>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 7022fddf34100..9b515a2721d7f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1792,24 +1792,24 @@ def : Pat<(v4f32 (uint_to_fp v4i64:$vj)), // XVFTINTRZ_{W_S/L_D} def : Pat<(v8i32 (fp_to_sint v8f32:$vj)), (XVFTINTRZ_W_S v8f32:$vj)>; def : Pat<(v4i64 (fp_to_sint v4f64:$vj)), (XVFTINTRZ_L_D v4f64:$vj)>; -def : Pat<(v4i64 (fp_to_sint v4f32:$vj)), - (VEXT2XV_D_W (SUBREG_TO_REG (i64 0), (VFTINTRZ_W_S v4f32:$vj), - sub_128))>; -def : Pat<(v4i32 (fp_to_sint (v4f64 LASX256:$vj))), - (EXTRACT_SUBREG (XVFTINTRZ_W_S (XVFCVT_S_D (XVPERMI_D v4f64:$vj, 238), - v4f64:$vj)), - sub_128)>; +def : Pat<(v4i64(fp_to_sint v4f32:$vj)), (VEXT2XV_D_W(SUBREG_TO_REG(i64 0), + (VFTINTRZ_W_S v4f32:$vj), + sub_128))>; +def : Pat<(v4i32(fp_to_sint v4f64:$vj)), + (EXTRACT_SUBREG(XVPICKEV_W(XVPERMI_D(XVFTINTRZ_L_D v4f64:$vj), 238), + (XVFTINTRZ_L_D v4f64:$vj)), + sub_128)>; // XVFTINTRZ_{W_SU/L_DU} def : Pat<(v8i32 (fp_to_uint v8f32:$vj)), (XVFTINTRZ_WU_S v8f32:$vj)>; def : Pat<(v4i64 (fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)>; -def : Pat<(v4i64 (fp_to_uint v4f32:$vj)), - (VEXT2XV_DU_WU (SUBREG_TO_REG (i64 0), (VFTINTRZ_WU_S v4f32:$vj), - sub_128))>; -def : Pat<(v4i32 (fp_to_uint (v4f64 LASX256:$vj))), - (EXTRACT_SUBREG (XVFTINTRZ_W_S (XVFCVT_S_D (XVPERMI_D v4f64:$vj, 238), - v4f64:$vj)), - sub_128)>; +def : Pat<(v4i64(fp_to_uint v4f32:$vj)), (VEXT2XV_DU_WU(SUBREG_TO_REG(i64 0), + (VFTINTRZ_WU_S v4f32:$vj), + sub_128))>; +def : Pat<(v4i32(fp_to_uint v4f64:$vj)), + (EXTRACT_SUBREG(XVPICKEV_W(XVPERMI_D(XVFTINTRZ_LU_D v4f64:$vj), 238), + (XVFTINTRZ_LU_D v4f64:$vj)), + sub_128)>; // XVPERMI_Q foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in diff --git a/llvm/lib/Target/LoongArch/LoongArchLVZInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLVZInstrInfo.td index 50a16e2dd56b9..07b77ee971f27 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLVZInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLVZInstrInfo.td @@ -23,7 +23,7 @@ let Constraints = "$rd = $dst" in { def GCSRWR : FmtCSR<0x05000020, (outs GPR:$dst), (ins 
GPR:$rd, uimm14:$csr_num), "$rd, $csr_num">; def GCSRXCHG : FmtCSRXCHG<0x05000000, (outs GPR:$dst), - (ins GPR:$rd, GPR:$rj, uimm14:$csr_num), + (ins GPR:$rd, GPRNoR0R1:$rj, uimm14:$csr_num), "$rd, $rj, $csr_num">; } // Constraints = "$rd = $dst" diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h index daa47c4dc7e32..904985c189dba 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h @@ -41,7 +41,7 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo { /// Pairs of `jr` instructions and corresponding JTI operands, used for the /// `annotate-tablejump` option. - SmallVector, 4> JumpInfos; + SmallVector, 4> JumpInfos; public: LoongArchMachineFunctionInfo(const Function &F, @@ -76,14 +76,12 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo { return is_contained(SExt32Registers, Reg); } - void setJumpInfo(MachineInstr *JrMI, MachineOperand *JTIMO) { - JumpInfos.push_back(std::make_pair(JrMI, JTIMO)); + void setJumpInfo(MachineInstr *JrMI, int JTIIdx) { + JumpInfos.push_back(std::make_pair(JrMI, JTIIdx)); } unsigned getJumpInfoSize() { return JumpInfos.size(); } MachineInstr *getJumpInfoJrMI(unsigned Idx) { return JumpInfos[Idx].first; } - MachineOperand *getJumpInfoJTIMO(unsigned Idx) { - return JumpInfos[Idx].second; - } + int getJumpInfoJTIIndex(unsigned Idx) { return JumpInfos[Idx].second; } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td index a8419980868ee..2a8cdf953e00f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td @@ -127,6 +127,11 @@ def GPRT : GPRRegisterClass<(add // a0...a7, t0...t8 // prediction. def GPRJR : GPRRegisterClass<(sub GPR, R1)>; +// Don't use R0 or R1 for the rj operand of [G]CSRXCHG, because when rj is +// encoded as 0 or 1, the instruction is interpreted as [G]CSRRD or [G]CSRWR, +// respectively, rather than [G]CSRXCHG. +def GPRNoR0R1 : GPRRegisterClass<(sub GPR, R0, R1)>; + // Floating point registers let RegAltNameIndices = [RegAliasName] in { diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 62b08be5435cd..27f97b2cebb0c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -188,7 +188,7 @@ void LoongArchPassConfig::addCodeGenPrepare() { } bool LoongArchPassConfig::addInstSelector() { - addPass(createLoongArchISelDag(getLoongArchTargetMachine())); + addPass(createLoongArchISelDag(getLoongArchTargetMachine(), getOptLevel())); return false; } diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index bb2e5781c34db..6f4c1e16190f4 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -2135,11 +2135,13 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const { } // Synthesize the probe loop. 
-static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, +static void emitStackProbeInline(MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg, bool IsRVV) { assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP"); + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + auto &Subtarget = MF.getSubtarget(); const RISCVInstrInfo *TII = Subtarget.getInstrInfo(); bool IsRV64 = Subtarget.is64Bit(); @@ -2228,7 +2230,7 @@ void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MI->getIterator(); DebugLoc DL = MBB.findDebugLoc(MBBI); Register TargetReg = MI->getOperand(1).getReg(); - emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg, + emitStackProbeInline(MBBI, DL, TargetReg, (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV)); MBBI->eraseFromParent(); } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 2b8269e440e90..049865c81667a 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -254,7 +254,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ROTR, MVT::i128, Expand); setOperationAction(ISD::ROTL, MVT::i128, Expand); - // No special instructions for these before arch15. + // No special instructions for these before z17. if (!Subtarget.hasVectorEnhancements3()) { setOperationAction(ISD::MUL, MVT::i128, Expand); setOperationAction(ISD::MULHS, MVT::i128, Expand); @@ -281,7 +281,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // Use VPOPCT and add up partial results. setOperationAction(ISD::CTPOP, MVT::i128, Custom); - // Additional instructions available with arch15. + // Additional instructions available with z17. if (Subtarget.hasVectorEnhancements3()) { setOperationAction(ISD::ABS, MVT::i128, Legal); } @@ -353,7 +353,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote); setOperationAction(ISD::CTLZ, MVT::i64, Legal); - // On arch15 we have native support for a 64-bit CTTZ. + // On z17 we have native support for a 64-bit CTTZ. if (Subtarget.hasMiscellaneousExtensions4()) { setOperationAction(ISD::CTTZ, MVT::i32, Promote); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Promote); @@ -4466,7 +4466,7 @@ SDValue SystemZTargetLowering::lowerMULH(SDValue Op, SDLoc DL(Op); SDValue Even, Odd; - // This custom expander is only used on arch15 and later for 64-bit types. + // This custom expander is only used on z17 and later for 64-bit types. assert(!is32Bit(VT)); assert(Subtarget.hasMiscellaneousExtensions2()); diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td index edd20a5de8c63..a4ece32c79d88 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td @@ -1973,7 +1973,7 @@ let Predicates = [FeatureVector] in { (VLEG (VGBM 0), bdxaddr12only:$addr, 1)>; } -// In-register i128 sign-extensions on arch15. +// In-register i128 sign-extensions on z17. 
let Predicates = [FeatureVectorEnhancements3] in { def : Pat<(i128 (sext_inreg VR128:$x, i8)), (VUPLG (VSEGB VR128:$x))>; def : Pat<(i128 (sext_inreg VR128:$x, i16)), (VUPLG (VSEGH VR128:$x))>; @@ -1993,7 +1993,7 @@ let Predicates = [FeatureVector] in { (VSRAB (VREPG VR128:$x, 1), (VREPIB 64))>; } -// Sign-extensions from GPR to i128 on arch15. +// Sign-extensions from GPR to i128 on z17. let Predicates = [FeatureVectorEnhancements3] in { def : Pat<(i128 (sext_inreg (anyext GR32:$x), i8)), (VUPLG (VLVGP (LGBR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$x, subreg_l32)), diff --git a/llvm/lib/Target/SystemZ/SystemZProcessors.td b/llvm/lib/Target/SystemZ/SystemZProcessors.td index 75b6671dc7723..0827701a48b5a 100644 --- a/llvm/lib/Target/SystemZ/SystemZProcessors.td +++ b/llvm/lib/Target/SystemZ/SystemZProcessors.td @@ -41,4 +41,5 @@ def : ProcessorModel<"z15", Z15Model, Arch13SupportedFeatures.List>; def : ProcessorModel<"arch14", Z16Model, Arch14SupportedFeatures.List>; def : ProcessorModel<"z16", Z16Model, Arch14SupportedFeatures.List>; -def : ProcessorModel<"arch15", Z16Model, Arch15SupportedFeatures.List>; +def : ProcessorModel<"arch15", Z17Model, Arch15SupportedFeatures.List>; +def : ProcessorModel<"z17", Z17Model, Arch15SupportedFeatures.List>; diff --git a/llvm/lib/Target/SystemZ/SystemZSchedule.td b/llvm/lib/Target/SystemZ/SystemZSchedule.td index d683cc042e5c9..cc03a71d8a649 100644 --- a/llvm/lib/Target/SystemZ/SystemZSchedule.td +++ b/llvm/lib/Target/SystemZ/SystemZSchedule.td @@ -60,6 +60,7 @@ def VBU : SchedWrite; // Virtual branching unit def MCD : SchedWrite; // Millicode +include "SystemZScheduleZ17.td" include "SystemZScheduleZ16.td" include "SystemZScheduleZ15.td" include "SystemZScheduleZ14.td" diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td index 2c01691707cc3..a9354ea76c72c 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td @@ -1555,12 +1555,12 @@ def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>; def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "VSCH(S|D|X)?P$")>; def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "VSCSHP$")>; -def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "VCSPH")>; -def : InstRW<[WLat2, WLat2, VecXsPm, NormalGr], (instregex "VCLZDP")>; -def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRPR")>; -def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VPKZR")>; -def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VUPKZH")>; -def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VUPKZL")>; +def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "VCSPH$")>; +def : InstRW<[WLat2, WLat2, VecXsPm, NormalGr], (instregex "VCLZDP$")>; +def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRPR$")>; +def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VPKZR$")>; +def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VUPKZH$")>; +def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VUPKZL$")>; // -------------------------------- System ---------------------------------- // @@ -1597,8 +1597,8 @@ def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>; // System: Breaking-Event-Address-Register Instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LBEAR")>; -def : InstRW<[WLat1, LSU2, FXb, GroupAlone], (instregex "STBEAR")>; +def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex 
"LBEAR$")>; +def : InstRW<[WLat1, LSU2, FXb, GroupAlone], (instregex "STBEAR$")>; //===----------------------------------------------------------------------===// // System: Storage-Key and Real Memory Instructions diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ17.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ17.td new file mode 100644 index 0000000000000..bd52627f636a7 --- /dev/null +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ17.td @@ -0,0 +1,1754 @@ +//--- SystemZScheduleZ17.td - SystemZ Scheduling Definitions ---*- tblgen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Z17 to support instruction +// scheduling and other instruction cost heuristics. +// +// Pseudos expanded right after isel do not need to be modelled here. +// +//===----------------------------------------------------------------------===// + +def Z17Model : SchedMachineModel { + + let UnsupportedFeatures = Arch15UnsupportedFeatures.List; + + let IssueWidth = 6; // Number of instructions decoded per cycle. + let MicroOpBufferSize = 60; // Issue queues + let LoadLatency = 1; // Optimistic load latency. + + let PostRAScheduler = 1; + + // Extra cycles for a mispredicted branch. + let MispredictPenalty = 20; +} + +let SchedModel = Z17Model in { +// These definitions need the SchedModel value. They could be put in a +// subtarget common include file, but it seems the include system in Tablegen +// currently (2016) rejects multiple includes of same file. + +// Decoder grouping rules +let NumMicroOps = 1 in { + def : WriteRes; + def : WriteRes { let BeginGroup = 1; } + def : WriteRes { let EndGroup = 1; } +} +def : WriteRes { + let NumMicroOps = 2; + let BeginGroup = 1; +} +def : WriteRes { + let NumMicroOps = 3; + let BeginGroup = 1; + let EndGroup = 1; +} +def : WriteRes { + let NumMicroOps = 6; + let BeginGroup = 1; + let EndGroup = 1; +} +def : WriteRes { + let NumMicroOps = 9; + let BeginGroup = 1; + let EndGroup = 1; +} + +// Incoming latency removed from the register operand which is used together +// with a memory operand by the instruction. +def : ReadAdvance; + +// LoadLatency (above) is not used for instructions in this file. This is +// instead the role of LSULatency, which is the latency value added to the +// result of loads and instructions with folded memory operands. +def : WriteRes { let Latency = 4; let NumMicroOps = 0; } + +let NumMicroOps = 0 in { + foreach L = 1-30 in + def : WriteRes("WLat"#L), []> { let Latency = L; } +} + +// Execution units. +def Z17_FXaUnit : ProcResource<2>; +def Z17_FXbUnit : ProcResource<2>; +def Z17_LSUnit : ProcResource<2>; +def Z17_VecUnit : ProcResource<2>; +def Z17_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ } +def Z17_VBUnit : ProcResource<2>; +def Z17_MCD : ProcResource<1>; + +// Subtarget specific definitions of scheduling resources. 
+let NumMicroOps = 0 in { + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + foreach Num = 2-5 in { let ReleaseAtCycles = [Num] in { + def : WriteRes("FXa"#Num), [Z17_FXaUnit]>; + def : WriteRes("FXb"#Num), [Z17_FXbUnit]>; + def : WriteRes("LSU"#Num), [Z17_LSUnit]>; + def : WriteRes("VecBF"#Num), [Z17_VecUnit]>; + def : WriteRes("VecDF"#Num), [Z17_VecUnit]>; + def : WriteRes("VecDFX"#Num), [Z17_VecUnit]>; + def : WriteRes("VecMul"#Num), [Z17_VecUnit]>; + def : WriteRes("VecStr"#Num), [Z17_VecUnit]>; + def : WriteRes("VecXsPm"#Num), [Z17_VecUnit]>; + }} + + def : WriteRes { let ReleaseAtCycles = [30]; } + def : WriteRes { let ReleaseAtCycles = [20]; } + + def : WriteRes; // Virtual Branching Unit +} + +def : WriteRes { let NumMicroOps = 3; + let BeginGroup = 1; + let EndGroup = 1; } + +// -------------------------- INSTRUCTIONS ---------------------------------- // + +// InstRW constructs have been used in order to preserve the +// readability of the InstrInfo files. + +// For each instruction, as matched by a regexp, provide a list of +// resources that it needs. These will be combined into a SchedClass. + +//===----------------------------------------------------------------------===// +// Stack allocation +//===----------------------------------------------------------------------===// + +// Pseudo -> LA / LAY +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>; + +//===----------------------------------------------------------------------===// +// Branch instructions +//===----------------------------------------------------------------------===// + +// Branch +def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>; +def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "BI(C)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>; +def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>; +def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "B(R)?X(H|L).*$")>; + +// Compare and branch +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>; +def : InstRW<[WLat1, FXb2, GroupAlone], + (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Trap instructions +//===----------------------------------------------------------------------===// + +// Trap +def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>; + +// Compare and trap +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Call and return instructions +//===----------------------------------------------------------------------===// + +// Call +def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL(_XPLINK64)?$")>; +def : 
InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?(_XPLINK64|_STACKEXT)?$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + +// Return +def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return(_XPLINK)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn(_XPLINK)?$")>; + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +// Moves +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>; +def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "MVCRL$")>; + +// Pseudo -> reg move +def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>; + +// Loads +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>; +def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>; + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>; + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR$")>; + +// Load and zero rightmost byte +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>; + +// Load and trap +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>; + +// Load and test +def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>; + +// Stores +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>; + +// String moves. 
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>; + +//===----------------------------------------------------------------------===// +// Conditional move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>; +def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>; +def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; +def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], + (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; + +def : InstRW<[WLat2, FXa, NormalGr], (instregex "SELRMux$")>; +def : InstRW<[WLat2, FXa, NormalGr], (instregex "SEL(G|FH)?R(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Sign extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>; + +def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>; + +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>; + +//===----------------------------------------------------------------------===// +// Zero extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>; + +// Load and zero rightmost byte +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>; + +// Load and trap +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>; + +//===----------------------------------------------------------------------===// +// Truncations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Multi-register moves +//===----------------------------------------------------------------------===// + +// Load multiple (estimated average of 5 ops) +def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>; + +// Load multiple disjoint +def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>; + +// Store multiple +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>; + 
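For reference, the Z17Model parameters declared at the top of this new file (IssueWidth = 6, MicroOpBufferSize = 60, LoadLatency = 1, MispredictPenalty = 20) surface as fields of the generated llvm::MCSchedModel that the schedulers read. A small hedged C++ sketch of reading them back; dumpZ17Model is an illustrative helper, not part of the patch.

#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"

// Illustrative helper: print the scheduling parameters as seen through the
// subtarget's MCSchedModel (for -mcpu=z17 these come from Z17Model above).
static void dumpZ17Model(const llvm::MCSubtargetInfo &STI) {
  const llvm::MCSchedModel &SM = STI.getSchedModel();
  llvm::outs() << "IssueWidth=" << SM.IssueWidth
               << " MicroOpBufferSize=" << SM.MicroOpBufferSize
               << " LoadLatency=" << SM.LoadLatency
               << " MispredictPenalty=" << SM.MispredictPenalty << "\n";
}
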
+//===----------------------------------------------------------------------===// +// Byte swaps +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>; +def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>; + +//===----------------------------------------------------------------------===// +// Load address instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>; + +// Load the Global Offset Table address ( -> larl ) +def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>; + +// Load (logical) indexed address. +def : InstRW<[WLat2, FXa2, NormalGr], (instregex "(L)?LXA(B|H|F|G|Q)$")>; + +//===----------------------------------------------------------------------===// +// Absolute and Negation +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LP(G)?R$")>; +def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "L(N|P)GFR$")>; +def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LN(R|GR)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>; +def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>; + +//===----------------------------------------------------------------------===// +// Insertion +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "IC32(Y)?$")>; +def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr], + (instregex "ICM(H|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>; + +//===----------------------------------------------------------------------===// +// Addition +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "A(Y)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AH(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AG$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AL(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "ALG(F)?$")>; +def : InstRW<[WLat1, 
FXa, NormalGr], (instregex "ALGHSIK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>; + +// Logical addition with carry +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], + (instregex "ALC(G)?$")>; +def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>; + +// Add with sign extension (16/32 -> 64) +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AG(F|H)$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>; + +//===----------------------------------------------------------------------===// +// Subtraction +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "S(G|Y)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SH(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SL(G|GF|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>; + +// Subtraction with borrow +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], + (instregex "SLB(G)?$")>; +def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>; + +// Subtraction with sign extension (16/32 -> 64) +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SG(F|H)$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>; + +//===----------------------------------------------------------------------===// +// AND +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "N(G|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>; + +//===----------------------------------------------------------------------===// +// OR +//===----------------------------------------------------------------------===// + 
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "O(G|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>; + +//===----------------------------------------------------------------------===// +// XOR +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "X(G|Y)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>; + +//===----------------------------------------------------------------------===// +// Combined logical operations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NC(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OC(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NN(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NO(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NOT(G)?R$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NX(G)?RK$")>; + +//===----------------------------------------------------------------------===// +// Multiplication +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MS(GF|Y)?$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MS(R|FI)$")>; +def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MSGR$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MSGF(I|R)$")>; +def : InstRW<[WLat5LSU, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MLG$")>; +def : InstRW<[WLat5, FXa2, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MGHI$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MHI$")>; +def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>; +def : InstRW<[WLat5, FXa2, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[WLat5LSU, RegReadAdv, FXa2, LSU, GroupAlone], + (instregex "M(FY|L)?$")>; +def : InstRW<[WLat8, RegReadAdv, FXa, LSU, NormalGr], (instregex "MGH$")>; +def : InstRW<[WLat9, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MG$")>; +def : InstRW<[WLat5, FXa2, GroupAlone], (instregex "MGRK$")>; +def : InstRW<[WLat4LSU, WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MSC$")>; +def : InstRW<[WLat4LSU, WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], + 
(instregex "MSGC$")>; +def : InstRW<[WLat4, WLat4, FXa, NormalGr], (instregex "MSRKC$")>; +def : InstRW<[WLat4, WLat4, FXa, NormalGr], (instregex "MSGRKC$")>; + +//===----------------------------------------------------------------------===// +// Division and remainder +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>; +def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], (instregex "D$")>; +def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>; +def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone2], + (instregex "DSG(F)?$")>; +def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>; +def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>; +def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], + (instregex "DL(G)?$")>; + +//===----------------------------------------------------------------------===// +// Shifts +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>; +def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2], + (instregex "S(L|R)D(A|L)$")>; + +// Rotate +def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>; + +// Rotate and insert +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)(Opt)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)(Opt)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?(Z)?(Opt)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>; + +// Rotate and Select +def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "R(N|O|X)SBG(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "C(G|Y|Mux)?$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "CL(Y|Mux)?$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>; +def : InstRW<[WLat1LSU, FXb, LSU, 
NormalGr], (instregex "CLH(RL|HSI)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLR$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>; +def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>; + +// Compare halfword +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>; +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>; +def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>; +def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>; + +// Compare logical character +def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>; + +// Test under mask +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>; + +// Compare logical characters under mask +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "CLM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Prefetch and execution hint +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>; +def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>; + +//===----------------------------------------------------------------------===// +// Atomic operations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>; + +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>; + +// Test and set +def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>; + +// Compare and swap +def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone], + (instregex "CS(G|Y)?$")>; + +// Compare double and swap +def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2], + (instregex "CDS(Y)?$")>; +def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, + GroupAlone3], (instregex "CDSG$")>; + +// Compare and swap and store +def : InstRW<[WLat30, MCD], (instregex "CSST$")>; + +// Perform locked operation +def : InstRW<[WLat30, MCD], (instregex "PLO$")>; + +// Load/store pair 
from/to quadword +def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>; +def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>; + +// Load pair disjoint +def : InstRW<[WLat1LSU, WLat1LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>; + +// Compare and load +def : InstRW<[WLat30, MCD], (instregex "CAL(G|GF)?$")>; + +// Perform functions with concurrent results +def : InstRW<[WLat30, MCD], (instregex "PFCR$")>; + +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>; +def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone2], + (instregex "TRT$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], + (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], + (instregex "KM(C|F|O|CTR|A)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], + (instregex "(KIMD|KLMD|KMAC|KDSA)(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], + (instregex "(PCC|PPNO|PRNO)$")>; + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LGG$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLGFSG$")>; +def : InstRW<[WLat30, MCD], (instregex "(L|ST)GSC$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat20, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone2], + (instregex "CVBG$")>; +def : InstRW<[WLat20, RegReadAdv, FXb, VecDF, LSU, GroupAlone2], + (instregex "CVB(Y)?$")>; +def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone3], (instregex "CVDG$")>; +def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone2], (instregex "CVD(Y)?$")>; +def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>; +def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>; +def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>; + +def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[WLat1, FXb, VecDFX2, LSU3, GroupAlone2], (instregex "MP$")>; +def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "DP$")>; +def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>; +def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>; +def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>; +def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>; + +//===----------------------------------------------------------------------===// +// Access registers 
+//===----------------------------------------------------------------------===// + +// Extract/set/copy access register +def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>; + +// Load address extended +def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>; + +// Load/store access multiple (not modeled precisely) +def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>; +def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>; + +//===----------------------------------------------------------------------===// +// Program mask and addressing mode +//===----------------------------------------------------------------------===// + +// Insert Program Mask +def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>; + +// Set Program Mask +def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>; + +// Branch and link +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>; + +// Test addressing mode +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>; + +// Set addressing mode +def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>; + +// Branch (and save) and set mode. +def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>; + +//===----------------------------------------------------------------------===// +// Transactional execution +//===----------------------------------------------------------------------===// + +// Transaction begin +def : InstRW<[WLat9, LSU2, FXb5, GroupAlone2], (instregex "TBEGIN(C)?$")>; + +// Transaction end +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>; + +// Transaction abort +def : InstRW<[WLat30, MCD], (instregex "TABORT$")>; + +// Extract Transaction Nesting Depth +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>; + +// Nontransactional store +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>; + +//===----------------------------------------------------------------------===// +// Processor assist +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "PPA$")>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +//===----------------------------------------------------------------------===// + +// Count leading/trailing zeros. +def : InstRW<[WLat3, FXa, NormalGr], (instregex "C(L|T)ZG$")>; + +// Find leftmost one +def : InstRW<[WLat5, WLat5, FXa2, GroupAlone], (instregex "FLOGR$")>; + +// Population count +def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT(Opt)?$")>; + +// Bit deposit and bit extract. 
+def : InstRW<[WLat4, FXa, NormalGr], (instregex "(BDEPG|BEXTG)$")>; + +// String instructions +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD], + (instregex "UPT$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "SORTL$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "DFLTCC$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "NNPA$")>; + +// Execute +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>; + +//===----------------------------------------------------------------------===// +// .insn directive instructions +//===----------------------------------------------------------------------===// + +// An "empty" sched-class will be assigned instead of the "invalid sched-class". +// getNumDecoderSlots() will then return 1 instead of 0. +def : InstRW<[], (instregex "Insn.*")>; + + +// ----------------------------- Floating point ----------------------------- // + +//===----------------------------------------------------------------------===// +// FP: Move instructions +//===----------------------------------------------------------------------===// + +// Load zero +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; + +// Load +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; +def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; + +// Load and Test +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXBR$")>; + +// Copy sign +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; + +//===----------------------------------------------------------------------===// +// FP: Load instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; + +//===----------------------------------------------------------------------===// +// FP: Store instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; + +//===----------------------------------------------------------------------===// +// FP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEDBR(A)?$")>; +def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>; + +// Load lengthened +def : InstRW<[WLat6LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LDEBR$")>; +def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>; +def : InstRW<[WLat7, VecBF4, 
GroupAlone], (instregex "LX(E|D)BR$")>; + +// Convert from fixed / logical +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)BR(A)?$")>; +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], + (instregex "C(F|G)(E|D)BR(A)?$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], + (instregex "C(F|G)XBR(A)?$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLFDBR$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; + +//===----------------------------------------------------------------------===// +// FP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; + +// Square root +def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>; +def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "SQEBR$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQDBR$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>; + +// Load FP integer +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>; + +//===----------------------------------------------------------------------===// +// FP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "A(E|D)B$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D)BR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>; + +// Subtraction +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "S(E|D)B$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D)BR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>; + +// Multiply +def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "M(D|DE|EE)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>; +def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], + (instregex "MXDB$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDBR$")>; +def : InstRW<[WLat20, VecDF4, GroupAlone], (instregex "MXBR$")>; + +// Multiply and add / subtract +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "M(A|S)EB$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>; +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(A|S)DBR$")>; + +// Division +def : InstRW<[WLat20, RegReadAdv, VecFPd20, LSU, NormalGr], (instregex "DEB$")>; +def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "DDB$")>; +def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex 
"DEBR$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "DDBR$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>; + +// Divide to integer +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>; + +//===----------------------------------------------------------------------===// +// FP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr], + (instregex "(K|C)(E|D)B$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>; +def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>; + +// Test Data Class +def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>; +def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>; + +//===----------------------------------------------------------------------===// +// FP: Floating-point control register instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>; +def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>; +def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>; +def : InstRW<[WLat30, MCD], (instregex "SFASR$")>; +def : InstRW<[WLat30, MCD], (instregex "LFAS$")>; +def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>; + + +// --------------------- Hexadecimal floating point ------------------------- // + +//===----------------------------------------------------------------------===// +// HFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEXR$")>; +def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>; + +// Load lengthened +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>; +def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>; + +// Convert from fixed +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)R$")>; + +// Convert to fixed +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>; + +// Convert BFP to HFP / HFP to BFP. 
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "THD(E)?R$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "TB(E)?DR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>; + +// Halve +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "H(E|D)R$")>; + +// Square root +def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>; +def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "SQER$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQDR$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>; + +// Load FP integer +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)R$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "A(E|D|U|W)$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>; + +// Subtraction +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "S(E|D|U|W)$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>; + +// Multiply +def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "M(D|DE|E|EE)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>; +def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], + (instregex "MXD$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDR$")>; +def : InstRW<[WLat20, VecDF4, GroupAlone], (instregex "MXR$")>; +def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], (instregex "MY$")>; +def : InstRW<[WLat6LSU, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "MY(H|L)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MYR$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MY(H|L)R$")>; + +// Multiply and add / subtract +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "M(A|S)(E|D)$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>; +def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone], + (instregex "MAY$")>; +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "MAY(H|L)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>; + +// Division +def : InstRW<[WLat20, RegReadAdv, VecFPd20, LSU, NormalGr], (instregex "DE$")>; +def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "DD$")>; +def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "DER$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "DDR$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Comparisons 
+//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "C(E|D)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "C(E|D)R$")>; +def : InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>; + + +// ------------------------ Decimal floating point -------------------------- // + +//===----------------------------------------------------------------------===// +// DFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>; +def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>; + +// Load lengthened +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>; + +// Convert from fixed / logical +def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDFTR(A)?$")>; +def : InstRW<[WLat20, FXb, VecDF, Cracked], (instregex "CDGTR(A)?$")>; +def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXFTR(A)?$")>; +def : InstRW<[WLat20, FXb, VecDF4, GroupAlone2], (instregex "CXGTR(A)?$")>; +def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDLFTR$")>; +def : InstRW<[WLat20, FXb, VecDF, Cracked], (instregex "CDLGTR$")>; +def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXLFTR$")>; +def : InstRW<[WLat20, FXb, VecDF4, GroupAlone2], (instregex "CXLGTR$")>; + +// Convert to fixed / logical +def : InstRW<[WLat20, WLat20, FXb, VecDF, Cracked], + (instregex "C(F|G)DTR(A)?$")>; +def : InstRW<[WLat20, WLat20, FXb, VecDF2, Cracked], + (instregex "C(F|G)XTR(A)?$")>; +def : InstRW<[WLat20, WLat20, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>; +def : InstRW<[WLat20, WLat20, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>; + +// Convert from / to signed / unsigned packed +def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>; +def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone2], (instregex "CX(S|U)TR$")>; +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>; +def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone2], (instregex "C(S|U)XTR$")>; + +// Convert from / to zoned +def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>; +def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXZT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>; + +// Convert from / to packed +def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>; +def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXPT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>; + +// Perform floating-point operation +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>; + +//===----------------------------------------------------------------------===// +// DFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load FP integer 
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>; + +// Extract biased exponent +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>; +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEXTR$")>; + +// Extract significance +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>; +def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>; + +// Subtraction +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>; + +// Multiply +def : InstRW<[WLat20, VecDF, NormalGr], (instregex "MDTR(A)?$")>; +def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>; + +// Division +def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>; +def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>; + +// Quantize +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>; + +// Reround +def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>; +def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone2], (instregex "RRXTR$")>; + +// Shift significand left/right +def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>; +def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>; + +// Insert biased exponent +def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "IEXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>; +def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>; + +// Compare biased exponent +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>; +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>; + +// Test Data Class/Group +def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>; +def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>; + + +// --------------------------------- Vector --------------------------------- // + +//===----------------------------------------------------------------------===// +// Vector: Move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Immediate instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>; +def : 
InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Loads +//===----------------------------------------------------------------------===// + +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; +def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], + (instregex "VLE(B|F|G|H)$")>; +def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked], + (instregex "VGE(F|G)$")>; +def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], + (instregex "VLM(Align)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Stores +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; +def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; +def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Byte swaps +//===----------------------------------------------------------------------===// + +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBR(H|F|G|Q)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLER(H|F|G)?$")>; +def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], + (instregex "VLEBR(H|F|G)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEBRZ(H|F|G|E)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBRREP(H|F|G)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTBR(H|F|G|Q)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTER(H|F|G)?$")>; +def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTEBRH$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTEBR(F|G)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Selects and permutes +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VBPERM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VBLEND(B|F|G|H|Q)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Widening 
and narrowing +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGEM(B|H|F|G|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H|G)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F|G)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|G|W)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H|G)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O|N|X)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO(C)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H|Q)?$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VD(L)?(F|G|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VEVAL$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMX(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H|Q)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F|G|Q)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H|G)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|G|Q|W)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H|G)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H|G)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H|G)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H|G|Q)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H|G)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H|G|Q)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F|G|Q)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H|G)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|G|Q|W)?$")>; +def : 
InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H|G)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H|G)?$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VMSL(G)?$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT(B|F|G|H)?$")>; + +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VR(L)?(F|G|Q)?$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESLV(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)B$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLD$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSRD$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>; + +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H|Q)?$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H|Q)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H|Q)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H|Q)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H|Q)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H|Q)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H|Q)S$")>; +def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point arithmetic +//===----------------------------------------------------------------------===// + +// Conversion and rounding +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCFP(S|L)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?G$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?GB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCD(L)?GB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCE(L)?FB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCE(L)?FB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(S|L)FP$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GD$")>; +def : InstRW<[WLat6, 
VecBF, NormalGr], (instregex "VC(L)?GDB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?GDB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?FEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?FEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(L|R)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(LS|RD)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFL(LS|RD)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFLLD$")>; +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "WFLRX$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFI(DB)?$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFIDB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFISB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFISB$")>; +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "WFIXB$")>; + +// Sign operations +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSOSB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFPSOXB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFL(C|N|P)XB$")>; + +// Minimum / maximum +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)SB$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WF(MAX|MIN)XB$")>; + +// Test data class +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCISB$")>; +def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>; + +// Add / subtract +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>; +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "WF(A|S)XB$")>; + +// Multiply / multiply-and-add/subtract +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>; +def : InstRW<[WLat20, VecDF, NormalGr], (instregex "WFMXB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[WLat20, VecDF, NormalGr], (instregex "WF(N)?M(A|S)XB$")>; + 
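+// Editorial reading aid (not part of the upstream scheduling data): each
+// InstRW record in this file pairs a list of SchedWrites/resources with an
+// instregex that selects instructions by mnemonic.  For instance, in the
+// add/subtract record above,
+//   def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
+// WLat6 is assumed to encode a 6-cycle result latency, VecBF the vector
+// binary floating-point pipeline that is occupied, and NormalGr the decoder
+// grouping (contrast Cracked and GroupAlone used elsewhere in this file).
+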
+// Divide / square root +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>; +def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "WFDSB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFDSB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFDXB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>; +def : InstRW<[WLat20, VecFPd20, NormalGr], (instregex "WFSQSB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQSB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFSQXB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SB$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XB$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XB$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFK(E|H|HE)DBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], + (instregex "WF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], + (instregex "VF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SBS$")>; +def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XBS$")>; +def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XBS$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)SB$")>; +def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point insertion and extraction +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>; + +//===----------------------------------------------------------------------===// +// Vector: String instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], + (instregex 
"VFEE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], + (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRS(B|F|H)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRSZ(B|F|H)$")>; + +//===----------------------------------------------------------------------===// +// NNP assist instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCFN$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLFN(L|H)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VC(R)?NF$")>; + +//===----------------------------------------------------------------------===// +// Vector: Packed-decimal instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "VLIP$")>; +def : InstRW<[WLat6, VecDFX, LSU, GroupAlone2], (instregex "VPKZ$")>; +def : InstRW<[WLat1, VecDFX, FXb, LSU2, GroupAlone2], (instregex "VUPKZ$")>; +def : InstRW<[WLat20, WLat20, VecDF, FXb, GroupAlone], + (instregex "VCVB(G|Q)?(Opt)?$")>; +def : InstRW<[WLat15, WLat15, VecDF, FXb, GroupAlone], + (instregex "VCVD(G|Q)?$")>; +def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "V(A|S)P$")>; +def : InstRW<[WLat30, WLat30, VecDF, GroupAlone], (instregex "VM(S)?P$")>; +def : InstRW<[WLat30, WLat30, VecDF, GroupAlone], (instregex "V(D|R)P$")>; +def : InstRW<[WLat30, WLat30, VecDF, GroupAlone], (instregex "VSDP$")>; +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "VSRP(R)?$")>; +def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "VPSOP$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)(P|Z)(Opt)?$")>; + +def : InstRW<[WLat20, VecDF, NormalGr], (instregex "VSCH(S|D|X)?P$")>; +def : InstRW<[WLat30, VecDF, NormalGr], (instregex "VSCSHP$")>; +def : InstRW<[WLat30, VecDF, NormalGr], (instregex "VCSPH$")>; +def : InstRW<[WLat2, WLat2, VecXsPm, NormalGr], (instregex "VCLZDP$")>; +def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VPKZR$")>; +def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VUPKZH$")>; +def : InstRW<[WLat2, WLat2, VecDFX, NormalGr], (instregex "VUPKZL$")>; + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>; +def : InstRW<[WLat20, GroupAlone3], (instregex "LPSW(E)?(Y)?$")>; +def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; +def : InstRW<[WLat3, FXa, NormalGr], (instregex "IAC$")>; +def 
: InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>; +def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>; +def : InstRW<[WLat30, MCD], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>; + +//===----------------------------------------------------------------------===// +// System: Breaking-Event-Address-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LBEAR$")>; +def : InstRW<[WLat1, LSU2, FXb, GroupAlone], (instregex "STBEAR$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "ISKE$")>; +def : InstRW<[WLat30, MCD], (instregex "IVSK$")>; +def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>; +def : InstRW<[WLat30, MCD], (instregex "IRBM$")>; +def : InstRW<[WLat30, MCD], (instregex "PFMF$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>; +def : InstRW<[WLat30, MCD], (instregex "PGIN$")>; +def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "RDP(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "PTLB$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>; +def : InstRW<[WLat30, MCD], (instregex "STRAG$")>; +def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>; +def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>; +def : InstRW<[WLat30, MCD], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>; +def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>; +def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>; +def : InstRW<[WLat30, MCD], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex 
"LASP$")>; +def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>; +def : InstRW<[WLat30, MCD], (instregex "PC$")>; +def : InstRW<[WLat30, MCD], (instregex "PR$")>; +def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>; +def : InstRW<[WLat30, MCD], (instregex "RP$")>; +def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>; +def : InstRW<[WLat30, MCD], (instregex "TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "BAKR$")>; +def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "PTFF$")>; +def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>; +def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>; +def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone2], (instregex "STCK(F)?$")>; +def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone3], (instregex "STCKE$")>; +def : InstRW<[WLat30, MCD], (instregex "STCKC$")>; +def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "STAP$")>; +def : InstRW<[WLat30, MCD], (instregex "STIDP$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>; +def : InstRW<[WLat30, MCD], (instregex "ECAG$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>; +def : InstRW<[WLat30, MCD], (instregex "PTF$")>; +def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "QPACI$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "SVC$")>; +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>; +def : InstRW<[WLat30, MCD], (instregex "DIAG$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRACE$")>; +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TRACG$")>; +def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>; +def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>; +def : InstRW<[WLat30, MCD], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>; +def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>; +def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>; +def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>; +def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[WLat30, MCD], (instregex "RCHP$")>; +def : InstRW<[WLat30, MCD], (instregex "SCHM$")>; +def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; +def : InstRW<[WLat30, MCD], (instregex "TPE?I$")>; +def : InstRW<[WLat30, MCD], (instregex "SAL$")>; + +//===----------------------------------------------------------------------===// +// NOPs +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?(Opt)?$")>; +def : InstRW<[WLat1, VBU, NormalGr], (instregex "J(G)?NOP$")>; +} + diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 17889dacc868c..31a33c1e7365b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -64,8 +64,8 @@ def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32, I32_0)>; def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64, I64_0)>; def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>; def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>; -def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8, - v8i16], +def V128 : WebAssemblyRegClass<[v2i64, v4i32, v16i8, v8i16, + v8f16, v4f32, v2f64], 128, (add V128_0)>; def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>; def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 627cef9ead7ff..12c40b501f627 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20889,7 +20889,8 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, return SDValue(); unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits; - if (Flags.hasNoSignedWrap() || MinSignBits < NumSignBits) { + if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) || + MinSignBits < NumSignBits) { PackOpcode = X86ISD::PACKSS; return In; } @@ -54147,12 +54148,19 @@ SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, if (!Flags.hasNoSignedZeros()) break; + // Because getCheaperNegatedExpression can delete nodes we need a handle to + // keep temporary nodes alive. + std::list Handles; + // This is always negatible for free but we might be able to remove some // extra operand negations as well. SmallVector NewOps(Op.getNumOperands(), SDValue()); - for (int i = 0; i != 3; ++i) + for (int i = 0; i != 3; ++i) { NewOps[i] = getCheaperNegatedExpression( Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1); + if (!!NewOps[i]) + Handles.emplace_back(NewOps[i]); + } bool NegA = !!NewOps[0]; bool NegB = !!NewOps[1]; diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index fa57ae183bb84..4c60698a63eff 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -428,7 +428,7 @@ StringRef getCPUNameFromS390Model(unsigned int Id, bool HaveVectorSupport) { case 9175: case 9176: default: - return HaveVectorSupport? "arch15" : "zEC12"; + return HaveVectorSupport? 
"z17" : "zEC12"; } } } // end anonymous namespace diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index c78d60fd86b3f..64ec411cb06e1 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -45,9 +45,8 @@ struct RISCVProfile { } // end anonymous namespace -static const char *RISCVGImplications[] = { - "i", "m", "a", "f", "d", "zicsr", "zifencei" -}; +static const char *RISCVGImplications[] = {"i", "m", "a", "f", "d"}; +static const char *RISCVGImplicationsZi[] = {"zicsr", "zifencei"}; #define GET_SUPPORTED_EXTENSIONS #include "llvm/TargetParser/RISCVTargetParserDef.inc" @@ -718,6 +717,19 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, } while (!Ext.empty()); } + // We add Zicsr/Zifenci as final to allow duplicated "zicsr"/"zifencei" like + // "rv64g_zicsr_zifencei". + if (Baseline == 'g') { + for (const char *Ext : RISCVGImplicationsZi) { + if (ISAInfo->Exts.count(Ext)) + continue; + + auto Version = findDefaultVersion(Ext); + assert(Version && "Default extension version not found?"); + ISAInfo->Exts[std::string(Ext)] = {Version->Major, Version->Minor}; + } + } + return RISCVISAInfo::postProcessAndChecking(std::move(ISAInfo)); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 6860a7cd07b78..118d2d4be828f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -3029,10 +3029,18 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { SmallVector BCs; DenseMap NewBCs; for (User *U : SVI.users()) - if (BitCastInst *BC = dyn_cast(U)) - if (!BC->use_empty()) - // Only visit bitcasts that weren't previously handled. - BCs.push_back(BC); + if (BitCastInst *BC = dyn_cast(U)) { + // Only visit bitcasts that weren't previously handled. + if (BC->use_empty()) + continue; + // Prefer to combine bitcasts of bitcasts before attempting this fold. + if (BC->hasOneUse()) { + auto *BC2 = dyn_cast(BC->user_back()); + if (BC2 && isEliminableCastPair(BC, BC2)) + continue; + } + BCs.push_back(BC); + } for (BitCastInst *BC : BCs) { unsigned BegIdx = Mask.front(); Type *TgtTy = BC->getDestTy(); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index a64c188575e6c..0f5e867877da2 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1697,6 +1697,15 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI, if (SI->getType()->isIntOrIntVectorTy(1)) return nullptr; + // Avoid breaking min/max reduction pattern, + // which is necessary for vectorization later. + if (isa(&Op)) + for (Value *IntrinOp : Op.operands()) + if (auto *PN = dyn_cast(IntrinOp)) + for (Value *PhiOp : PN->operands()) + if (PhiOp == &Op) + return nullptr; + // Test if a FCmpInst instruction is used exclusively by a select as // part of a minimum or maximum operation. If so, refrain from doing // any other folding. 
This helps out other analyses which understand diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 8e74b8645fad9..86c4170b9a977 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -370,15 +370,30 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, { // Scope for SwitchInstProfUpdateWrapper. It must not live during // ConstantFoldTerminator() as the underlying SwitchInst can be changed. SwitchInstProfUpdateWrapper SI(*I); + ConstantRange CR = + LVI->getConstantRangeAtUse(I->getOperandUse(0), /*UndefAllowed=*/false); unsigned ReachableCaseCount = 0; for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); - auto *Res = dyn_cast_or_null( - LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, - /* UseBlockValue */ true)); + std::optional Predicate = std::nullopt; + if (!CR.contains(Case->getValue())) + Predicate = false; + else if (CR.isSingleElement() && + *CR.getSingleElement() == Case->getValue()) + Predicate = true; + if (!Predicate) { + // Handle missing cases, e.g., the range has a hole. + auto *Res = dyn_cast_or_null( + LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, + /* UseBlockValue=*/true)); + if (Res && Res->isZero()) + Predicate = false; + else if (Res && Res->isOne()) + Predicate = true; + } - if (Res && Res->isZero()) { + if (Predicate && !*Predicate) { // This case never fires - remove it. BasicBlock *Succ = CI->getCaseSuccessor(); Succ->removePredecessor(BB); @@ -395,7 +410,7 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB, Succ}}); continue; } - if (Res && Res->isOne()) { + if (Predicate && *Predicate) { // This case always fires. Arrange for the switch to be turned into an // unconditional branch by replacing the switch condition with the case // value. @@ -410,28 +425,24 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, ++ReachableCaseCount; } - BasicBlock *DefaultDest = SI->getDefaultDest(); - if (ReachableCaseCount > 1 && - !isa(DefaultDest->getFirstNonPHIOrDbg())) { - ConstantRange CR = LVI->getConstantRangeAtUse(I->getOperandUse(0), - /*UndefAllowed*/ false); - // The default dest is unreachable if all cases are covered. - if (!CR.isSizeLargerThan(ReachableCaseCount)) { - BasicBlock *NewUnreachableBB = - BasicBlock::Create(BB->getContext(), "default.unreachable", - BB->getParent(), DefaultDest); - new UnreachableInst(BB->getContext(), NewUnreachableBB); + // The default dest is unreachable if all cases are covered. 
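
[Editor's note on the CorrelatedValuePropagation hunk above: the switch condition's range is now computed once with getConstantRangeAtUse and reused both for per-case folding and for the default-destination check that continues below, instead of issuing one getPredicateAt query per case. A small sketch of the per-case classification, with the elided template arguments filled in as assumptions (std::optional<bool>, APInt):]

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include <optional>
using namespace llvm;

// nullopt means the range alone cannot decide (e.g. it has a hole), and the
// caller falls back to LVI->getPredicateAt(ICMP_EQ, ...) as the old code did.
static std::optional<bool> classifyCase(const ConstantRange &CR,
                                        const ConstantInt *Case) {
  const APInt &V = Case->getValue();
  if (!CR.contains(V))
    return false; // Case never fires: its edge can be removed.
  if (CR.isSingleElement() && *CR.getSingleElement() == V)
    return true;  // Case always fires: the switch folds to a direct branch.
  return std::nullopt;
}
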
+ if (!SI->defaultDestUndefined() && + !CR.isSizeLargerThan(ReachableCaseCount)) { + BasicBlock *DefaultDest = SI->getDefaultDest(); + BasicBlock *NewUnreachableBB = + BasicBlock::Create(BB->getContext(), "default.unreachable", + BB->getParent(), DefaultDest); + new UnreachableInst(BB->getContext(), NewUnreachableBB); - DefaultDest->removePredecessor(BB); - SI->setDefaultDest(NewUnreachableBB); + DefaultDest->removePredecessor(BB); + SI->setDefaultDest(NewUnreachableBB); - if (SuccessorsCount[DefaultDest] == 1) - DTU.applyUpdates({{DominatorTree::Delete, BB, DefaultDest}}); - DTU.applyUpdates({{DominatorTree::Insert, BB, NewUnreachableBB}}); + if (SuccessorsCount[DefaultDest] == 1) + DTU.applyUpdates({{DominatorTree::Delete, BB, DefaultDest}}); + DTU.applyUpdates({{DominatorTree::Insert, BB, NewUnreachableBB}}); - ++NumDeadCases; - Changed = true; - } + ++NumDeadCases; + Changed = true; } } diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index 2700b4307308c..fe1fe391f5982 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/TargetParser/Triple.h" using namespace llvm; @@ -66,6 +67,20 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { if (!ElemType->isPointerTy() || DL.getPointerTypeSizeInBits(ElemType) != 64) return false; + SmallVector GVOps; + Triple TT(M.getTargetTriple()); + // FIXME: This should be removed in the future. + bool ShouldDropUnnamedAddr = + // Drop unnamed_addr to avoid matching pattern in + // `handleIndirectSymViaGOTPCRel`, which generates GOTPCREL relocations + // not supported by the GNU linker and LLD versions below 18 on aarch64. + TT.isAArch64() + // Apple's ld64 (and ld-prime on Xcode 15.2) miscompile something on + // x86_64-apple-darwin. See + // https://github.com/rust-lang/rust/issues/140686 and + // https://github.com/rust-lang/rust/issues/141306. + || (TT.isX86() && TT.isOSDarwin()); + for (const Use &Op : Array->operands()) { Constant *ConstOp = cast(&Op); GlobalValue *GVOp; @@ -85,8 +100,15 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { !GlovalVarOp->isDSOLocal() || !GlovalVarOp->isImplicitDSOLocal()) return false; + + if (ShouldDropUnnamedAddr) + GVOps.push_back(GlovalVarOp); } + if (ShouldDropUnnamedAddr) + for (auto *GVOp : GVOps) + GVOp->setUnnamedAddr(GlobalValue::UnnamedAddr::None); + return true; } diff --git a/llvm/test/Analysis/BasicAA/size-overflow.ll b/llvm/test/Analysis/BasicAA/size-overflow.ll new file mode 100644 index 0000000000000..2a390d29e472a --- /dev/null +++ b/llvm/test/Analysis/BasicAA/size-overflow.ll @@ -0,0 +1,14 @@ +; RUN: opt -passes=aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s + +target datalayout = "p:32:32" + +; Make sure that using a LocationSize larget than the index space does not +; assert. 
+ +; CHECK: Just Mod: Ptr: i32* %gep <-> call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 4294967296, i1 false) +define void @test(ptr %p, i32 %idx) { + %gep = getelementptr i8, ptr %p, i32 %idx + load i32, ptr %gep + call void @llvm.memset.i64(ptr %p, i8 0, i64 u0x100000000, i1 false) + ret void +} diff --git a/llvm/test/Analysis/CostModel/SystemZ/divrem-reg.ll b/llvm/test/Analysis/CostModel/SystemZ/divrem-reg.ll index 2f13d7e3ef9b1..68ffe5759e135 100644 --- a/llvm/test/Analysis/CostModel/SystemZ/divrem-reg.ll +++ b/llvm/test/Analysis/CostModel/SystemZ/divrem-reg.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s --check-prefixes=CHECK,Z13 -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=arch15 | FileCheck %s --check-prefixes=CHECK,ARC15 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z17 | FileCheck %s --check-prefixes=CHECK,Z17 ; Check costs of divisions by register ; @@ -52,9 +52,9 @@ define <2 x i64> @fun4(<2 x i64> %a, <2 x i64> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %r = sdiv <2 x i64> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r ; -; ARC15-LABEL: 'fun4' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv <2 x i64> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; Z17-LABEL: 'fun4' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv <2 x i64> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r ; %r = sdiv <2 x i64> %a, %b ret <2 x i64> %r @@ -65,9 +65,9 @@ define <4 x i32> @fun5(<4 x i32> %a, <4 x i32> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = sdiv <4 x i32> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r ; -; ARC15-LABEL: 'fun5' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv <4 x i32> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; Z17-LABEL: 'fun5' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv <4 x i32> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r ; %r = sdiv <4 x i32> %a, %b ret <4 x i32> %r @@ -78,9 +78,9 @@ define <2 x i32> @fun6(<2 x i32> %a, <2 x i32> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %r = sdiv <2 x i32> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r ; -; ARC15-LABEL: 'fun6' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv <2 x i32> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; Z17-LABEL: 'fun6' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv <2 x i32> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r ; %r = sdiv <2 x i32> %a, %b ret <2 x i32> %r @@ -167,9 +167,9 @@ define <2 x i64> @fun15(<2 x i64> %a, <2 x i64> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %r = udiv <2 x i64> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
<2 x i64> %r ; -; ARC15-LABEL: 'fun15' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv <2 x i64> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; Z17-LABEL: 'fun15' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv <2 x i64> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r ; %r = udiv <2 x i64> %a, %b ret <2 x i64> %r @@ -180,9 +180,9 @@ define <4 x i32> @fun16(<4 x i32> %a, <4 x i32> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = udiv <4 x i32> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r ; -; ARC15-LABEL: 'fun16' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv <4 x i32> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; Z17-LABEL: 'fun16' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv <4 x i32> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r ; %r = udiv <4 x i32> %a, %b ret <4 x i32> %r @@ -193,9 +193,9 @@ define <2 x i32> @fun17(<2 x i32> %a, <2 x i32> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %r = udiv <2 x i32> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r ; -; ARC15-LABEL: 'fun17' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv <2 x i32> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; Z17-LABEL: 'fun17' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv <2 x i32> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r ; %r = udiv <2 x i32> %a, %b ret <2 x i32> %r @@ -282,9 +282,9 @@ define <2 x i64> @fun26(<2 x i64> %a, <2 x i64> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %r = srem <2 x i64> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r ; -; ARC15-LABEL: 'fun26' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem <2 x i64> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; Z17-LABEL: 'fun26' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem <2 x i64> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r ; %r = srem <2 x i64> %a, %b ret <2 x i64> %r @@ -295,9 +295,9 @@ define <4 x i32> @fun27(<4 x i32> %a, <4 x i32> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = srem <4 x i32> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r ; -; ARC15-LABEL: 'fun27' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem <4 x i32> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; Z17-LABEL: 'fun27' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem <4 x i32> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r ; %r = srem <4 x i32> %a, %b ret <4 x i32> %r @@ -308,9 +308,9 @@ define <2 x i32> @fun28(<2 x i32> %a, <2 x i32> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 48 for instruction: 
%r = srem <2 x i32> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r ; -; ARC15-LABEL: 'fun28' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem <2 x i32> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; Z17-LABEL: 'fun28' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem <2 x i32> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r ; %r = srem <2 x i32> %a, %b ret <2 x i32> %r @@ -397,9 +397,9 @@ define <2 x i64> @fun37(<2 x i64> %a, <2 x i64> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %r = urem <2 x i64> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r ; -; ARC15-LABEL: 'fun37' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem <2 x i64> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; Z17-LABEL: 'fun37' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem <2 x i64> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r ; %r = urem <2 x i64> %a, %b ret <2 x i64> %r @@ -410,9 +410,9 @@ define <4 x i32> @fun38(<4 x i32> %a, <4 x i32> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = urem <4 x i32> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r ; -; ARC15-LABEL: 'fun38' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem <4 x i32> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; Z17-LABEL: 'fun38' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem <4 x i32> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r ; %r = urem <4 x i32> %a, %b ret <4 x i32> %r @@ -423,9 +423,9 @@ define <2 x i32> @fun39(<2 x i32> %a, <2 x i32> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %r = urem <2 x i32> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r ; -; ARC15-LABEL: 'fun39' -; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem <2 x i32> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; Z17-LABEL: 'fun39' +; Z17-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem <2 x i32> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r ; %r = urem <2 x i32> %a, %b ret <2 x i32> %r @@ -473,9 +473,9 @@ define <8 x i64> @fun44(<8 x i64> %a, <8 x i64> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = sdiv <8 x i64> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %r ; -; ARC15-LABEL: 'fun44' -; ARC15-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %r = sdiv <8 x i64> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %r +; Z17-LABEL: 'fun44' +; Z17-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %r = sdiv <8 x i64> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %r ; %r = sdiv <8 x i64> %a, %b ret <8 x i64> %r @@ -486,9 +486,9 @@ define <8 x i32> 
@fun45(<8 x i32> %a, <8 x i32> %b) { ; Z13-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = urem <8 x i32> %a, %b ; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %r ; -; ARC15-LABEL: 'fun45' -; ARC15-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r = urem <8 x i32> %a, %b -; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %r +; Z17-LABEL: 'fun45' +; Z17-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r = urem <8 x i32> %a, %b +; Z17-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %r ; %r = urem <8 x i32> %a, %b ret <8 x i32> %r diff --git a/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll index 105e634cea1ac..ba86c9ab1d702 100644 --- a/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll +++ b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll @@ -1,12 +1,12 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s --check-prefixes=CHECK,Z13 -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=arch15 | FileCheck %s --check-prefixes=CHECK,ARC15 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z17 | FileCheck %s --check-prefixes=CHECK,Z17 ; define i128 @fun1(i128 %val1, i128 %val2) { ; CHECK-LABEL: 'fun1' ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq i128 %val1, %val2 ; Z13: Cost Model: Found an estimated cost of 5 for instruction: %v128 = sext i1 %cmp to i128 -; ARC15: Cost Model: Found an estimated cost of 0 for instruction: %v128 = sext i1 %cmp to i128 +; Z17: Cost Model: Found an estimated cost of 0 for instruction: %v128 = sext i1 %cmp to i128 %cmp = icmp eq i128 %val1, %val2 %v128 = sext i1 %cmp to i128 ret i128 %v128 @@ -27,7 +27,7 @@ define i128 @fun3(i128 %val1, i128 %val2, ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq i128 %val1, %val2 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %add = add i128 %val3, %val4 ; Z13: Cost Model: Found an estimated cost of 4 for instruction: %sel = select i1 %cmp, i128 %val3, i128 %add -; ARC15: Cost Model: Found an estimated cost of 1 for instruction: %sel = select i1 %cmp, i128 %val3, i128 %add +; Z17: Cost Model: Found an estimated cost of 1 for instruction: %sel = select i1 %cmp, i128 %val3, i128 %add %cmp = icmp eq i128 %val1, %val2 %add = add i128 %val3, %val4 %sel = select i1 %cmp, i128 %val3, i128 %add @@ -40,7 +40,7 @@ define i64 @fun3_sel64(i128 %val1, i128 %val2, ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp ugt i128 %val1, %val2 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %add = add i64 %val3, %val4 ; Z13: Cost Model: Found an estimated cost of 4 for instruction: %sel = select i1 %cmp, i64 %val3, i64 %add -; ARC15: Cost Model: Found an estimated cost of 1 for instruction: %sel = select i1 %cmp, i64 %val3, i64 %add +; Z17: Cost Model: Found an estimated cost of 1 for instruction: %sel = select i1 %cmp, i64 %val3, i64 %add %cmp = icmp ugt i128 %val1, %val2 %add = add i64 %val3, %val4 %sel = select i1 %cmp, i64 %val3, i64 %add diff --git a/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll b/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll index bf5cbfb48a77b..ebeb2df281237 100644 --- a/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll +++ 
b/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=arch15 | FileCheck %s -check-prefix=ARC15 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z17 | FileCheck %s -check-prefix=Z17 ; ; Note: The scalarized vector instructions costs are not including any ; extracts, due to the undef operands. @@ -132,22 +132,22 @@ define void @mul() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = mul <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = mul <2 x i32> undef, undef ; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %res7 = mul <2 x i64> undef, undef -; ARC15: Cost Model: Found an estimated cost of 1 for instruction: %res7 = mul <2 x i64> undef, undef +; Z17: Cost Model: Found an estimated cost of 1 for instruction: %res7 = mul <2 x i64> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res8 = mul <4 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res9 = mul <4 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res10 = mul <4 x i32> undef, undef ; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %res11 = mul <4 x i64> undef, undef -; ARC15: Cost Model: Found an estimated cost of 2 for instruction: %res11 = mul <4 x i64> undef, undef +; Z17: Cost Model: Found an estimated cost of 2 for instruction: %res11 = mul <4 x i64> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res12 = mul <8 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res13 = mul <8 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res14 = mul <8 x i32> undef, undef ; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %res15 = mul <8 x i64> undef, undef -; ARC15: Cost Model: Found an estimated cost of 4 for instruction: %res15 = mul <8 x i64> undef, undef +; Z17: Cost Model: Found an estimated cost of 4 for instruction: %res15 = mul <8 x i64> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res16 = mul <16 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res17 = mul <16 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res18 = mul <16 x i32> undef, undef ; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %res19 = mul <16 x i64> undef, undef -; ARC15: Cost Model: Found an estimated cost of 8 for instruction: %res19 = mul <16 x i64> undef, undef +; Z17: Cost Model: Found an estimated cost of 8 for instruction: %res19 = mul <16 x i64> undef, undef ret void; } diff --git a/llvm/test/CodeGen/AArch64/framelayout-scavengingslot-stack-hazard.mir b/llvm/test/CodeGen/AArch64/framelayout-scavengingslot-stack-hazard.mir new file mode 100644 index 0000000000000..52ac36f801854 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-scavengingslot-stack-hazard.mir @@ -0,0 +1,99 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-stack-hazard-size=1024 -run-pass=prologepilog %s -o - | FileCheck %s +--- | + + define void @stack_hazard_streaming_compat() "aarch64_pstate_sm_compatible" { entry: unreachable } + define void 
@stack_hazard_streaming_compat_emergency_spill_slot() "aarch64_pstate_sm_compatible" { entry: unreachable } + +... + +# +------------------+ +# | GPR callee-saves | +# +------------------+ <- FP +# | | +# +------------------+ +# | FPR locals | +# | %stack.1 | +# +------------------+ +# | | +# +------------------+ +# | GPR locals | +# | %stack.2 | +# | | +# +------------------+ <- BP +# | | +# +------------------+ <- SP (can't be used due to VLA) + +# In this case without the base pointer we'd need the emergency spill slot to +# access both %stack.1 and %stack.2. With the base pointer we can reach both +# without spilling. + +name: stack_hazard_streaming_compat +# CHECK-LABEL: name: stack_hazard_streaming_compat +# CHECK: bb.0: +# CHECK: STRDui $d0, $x19, 131 +# CHECK-NEXT: STRXui $x0, $x19, 1 +# CHECK: bb.1: +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: true +stack: + - { id: 0, type: variable-sized, alignment: 1 } + - { id: 1, size: 8, alignment: 8 } + - { id: 2, size: 8, alignment: 8 } +body: | + bb.0: + liveins: $x0, $x8, $d0 + $x9 = LDRXui $x0, 0 :: (load (s64)) + STRDui $d0, %stack.1, 0 :: (store (s64) into %stack.1) + STRXui $x0, %stack.2, 0 :: (store (s64) into %stack.2) + B %bb.1 + bb.1: + liveins: $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $lr + RET_ReallyLR implicit $x19, implicit $x20, implicit $x21, implicit $x22, implicit $x23, implicit $x24, implicit $x25, implicit $x26, implicit $x27, implicit $x28, implicit $lr +... +--- +# +------------------+ +# | GPR callee-saves | +# +------------------+ <- FP +# | | +# +------------------+ +# | FPR locals | +# | %stack.1 | +# +------------------+ +# | | +# +------------------+ +# | GPR locals | +# | %stack.2 | (very large) +# | | +# +------------------+ <- BP +# | | +# +------------------+ <- SP (can't be used due to VLA) + +# In this case we need to use the emergency spill slot to access %stack.1 as it +# is too far from the frame pointer and the base pointer to directly address. +# Note: This also tests that the located near the SP/BP. 
+ +name: stack_hazard_streaming_compat_emergency_spill_slot +# CHECK-LABEL: name: stack_hazard_streaming_compat_emergency_spill_slot +# CHECK: bb.0: +# CHECK: STRXui killed $[[SCRATCH:x[0-9]+]], $x19, 0 +# CHECK-NEXT: $[[SCRATCH]] = ADDXri $x19, 1056, 0 +# CHECK-NEXT: STRDui $d0, killed $[[SCRATCH]], 4095 +# CHECK-NEXT: $[[SCRATCH]] = LDRXui $x19, 0 +# CHECK: bb.1: +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: true +stack: + - { id: 0, type: variable-sized, alignment: 1 } + - { id: 1, size: 8, alignment: 8 } + - { id: 2, size: 32761, alignment: 8 } +body: | + bb.0: + liveins: $x0, $x8, $d0 + $x9 = LDRXui $x0, 0 :: (load (s64)) + STRDui $d0, %stack.1, 0 :: (store (s64) into %stack.1) + B %bb.1 + bb.1: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $lr + RET_ReallyLR implicit $x19, implicit $x20, implicit $x21, implicit $x22, implicit $x23, implicit $x24, implicit $x25, implicit $x26, implicit $x27, implicit $x28, implicit $lr diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll index e7de54036245a..4bbbe40176313 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll @@ -9,9 +9,9 @@ define void @a() "sign-return-address"="all" "sign-return-address-key"="b_key" { ; CHECK-LABEL: a: // @a ; CHECK: // %bb.0: ; CHECK-NEXT: .cfi_b_key_frame -; CHECK-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #27 ; V83A-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state %1 = alloca i32, align 4 %2 = alloca i32, align 4 %3 = alloca i32, align 4 diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll index a26dda1d5c1f1..6a11bef08c740 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll @@ -5,9 +5,9 @@ define void @a() "sign-return-address"="all" { ; CHECK-LABEL: a: // @a -; CHECK: .cfi_negate_ra_state -; V8A-NEXT: hint #25 -; V83A-NEXT: paciasp +; V8A: hint #25 +; V83A: paciasp +; CHECK-NEXT: .cfi_negate_ra_state %1 = alloca i32, align 4 %2 = alloca i32, align 4 %3 = alloca i32, align 4 @@ -52,9 +52,9 @@ define void @b() "sign-return-address"="non-leaf" { define void @c() "sign-return-address"="all" { ; CHECK-LABEL: c: // @c -; CHECK: .cfi_negate_ra_state -; V8A-NEXT: hint #25 -; V83A-NEXT: paciasp +; V8A: hint #25 +; V83A: paciasp +; CHECK-NEXT .cfi_negate_ra_state %1 = alloca i32, align 4 %2 = alloca i32, align 4 %3 = alloca i32, align 4 diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll index 064b2b78c7bc7..1e7224683c6c8 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll @@ -8,8 +8,8 @@ define i64 @a(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key" ; V8A-LABEL: a: ; V8A: // %bb.0: ; V8A-NEXT: .cfi_b_key_frame -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #27 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: sub sp, sp, #32 ; V8A-NEXT: str x30, [sp, #16] // 8-byte Folded Spill ; V8A-NEXT: .cfi_def_cfa_offset 32 @@ -26,8 +26,8 @@ define i64 
@a(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key" ; V83A-LABEL: a: ; V83A: // %bb.0: ; V83A-NEXT: .cfi_b_key_frame -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: sub sp, sp, #32 ; V83A-NEXT: str x30, [sp, #16] // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 32 @@ -59,8 +59,8 @@ define i64 @b(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key" ; V8A-LABEL: b: ; V8A: // %bb.0: ; V8A-NEXT: .cfi_b_key_frame -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #27 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: sub sp, sp, #32 ; V8A-NEXT: str x30, [sp, #16] // 8-byte Folded Spill ; V8A-NEXT: .cfi_def_cfa_offset 32 @@ -77,8 +77,8 @@ define i64 @b(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key" ; V83A-LABEL: b: ; V83A: // %bb.0: ; V83A-NEXT: .cfi_b_key_frame -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: sub sp, sp, #32 ; V83A-NEXT: str x30, [sp, #16] // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 32 @@ -110,8 +110,8 @@ define i64 @c(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key" ; V8A-LABEL: c: ; V8A: // %bb.0: ; V8A-NEXT: .cfi_b_key_frame -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #27 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: sub sp, sp, #32 ; V8A-NEXT: str x30, [sp, #16] // 8-byte Folded Spill ; V8A-NEXT: .cfi_def_cfa_offset 32 @@ -128,8 +128,8 @@ define i64 @c(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key" ; V83A-LABEL: c: ; V83A: // %bb.0: ; V83A-NEXT: .cfi_b_key_frame -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: sub sp, sp, #32 ; V83A-NEXT: str x30, [sp, #16] // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 32 diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir index 218ee6609c803..9a983cbd6714e 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir @@ -81,8 +81,8 @@ body: | # CHECK: name: bar # CHECK: bb.0: # CHECK: frame-setup EMITBKEY -# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state # CHECK-NEXT: frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state # CHECK-NOT: OUTLINED_FUNCTION_ # CHECK: bb.1: # CHECK-NOT: OUTLINED_FUNCTION_ diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll index 5c45373d8c1d6..87771f5de4f69 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll @@ -7,8 +7,8 @@ define void @a() "sign-return-address"="all" { ; V8A-LABEL: a: ; V8A: // %bb.0: -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #25 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: sub sp, sp, #32 ; V8A-NEXT: .cfi_def_cfa_offset 32 ; V8A-NEXT: mov w8, #1 // =0x1 @@ -26,8 +26,8 @@ define void @a() "sign-return-address"="all" { ; ; V83A-LABEL: a: ; V83A: // %bb.0: -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: sub sp, sp, #32 ; V83A-NEXT: .cfi_def_cfa_offset 32 ; V83A-NEXT: mov w8, #1 // =0x1 @@ -60,8 +60,8 @@ define void @b() 
"sign-return-address"="all" "sign-return-address-key"="b_key" { ; V8A-LABEL: b: ; V8A: // %bb.0: ; V8A-NEXT: .cfi_b_key_frame -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #27 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: sub sp, sp, #32 ; V8A-NEXT: .cfi_def_cfa_offset 32 ; V8A-NEXT: mov w8, #1 // =0x1 @@ -80,8 +80,8 @@ define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" { ; V83A-LABEL: b: ; V83A: // %bb.0: ; V83A-NEXT: .cfi_b_key_frame -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: sub sp, sp, #32 ; V83A-NEXT: .cfi_def_cfa_offset 32 ; V83A-NEXT: mov w8, #1 // =0x1 @@ -113,8 +113,8 @@ define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" { define void @c() "sign-return-address"="all" { ; V8A-LABEL: c: ; V8A: // %bb.0: -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #25 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: sub sp, sp, #32 ; V8A-NEXT: .cfi_def_cfa_offset 32 ; V8A-NEXT: mov w8, #1 // =0x1 @@ -132,8 +132,8 @@ define void @c() "sign-return-address"="all" { ; ; V83A-LABEL: c: ; V83A: // %bb.0: -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: sub sp, sp, #32 ; V83A-NEXT: .cfi_def_cfa_offset 32 ; V83A-NEXT: mov w8, #1 // =0x1 diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir index d4a4b886ec0e3..22e5edef2a939 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir @@ -86,11 +86,11 @@ body: | # CHECK: body: | # CHECK-NEXT: bb.0 (%ir-block.0): # CHECK-NEXT: liveins: $lr -# CHECK: frame-setup CFI_INSTRUCTION negate_ra_sign_state -# CHECK-NEXT: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state # CHECK: BL @[[OUTLINED_FUNC:OUTLINED_FUNCTION_[0-9]+]] -# CHECK: frame-destroy CFI_INSTRUCTION negate_ra_sign_state -# CHECK-NEXT: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION negate_ra_sign_state # CHECK-NEXT: RET undef $lr ... @@ -119,11 +119,11 @@ body: | # CHECK: body: | # CHECK-NEXT: bb.0 (%ir-block.0): # CHECK-NEXT: liveins: $lr -# CHECK: frame-setup CFI_INSTRUCTION negate_ra_sign_state -# CHECK-NEXT: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state # CHECK: BL @[[OUTLINED_FUNC]] -# CHECK: frame-destroy CFI_INSTRUCTION negate_ra_sign_state -# CHECK-NEXT: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION negate_ra_sign_state # CHECK-NEXT: RET undef $lr ... 
@@ -174,22 +174,22 @@ body: | # CHECK: body: | # CHECK-NEXT: bb.0 (%ir-block.0): # CHECK-NEXT: liveins: $lr -# CHECK: frame-setup CFI_INSTRUCTION negate_ra_sign_state -# CHECK-NEXT: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state # CHECK-NOT: BL @OUTLINED_FUNCTION_{{.*}} -# CHECK: frame-destroy CFI_INSTRUCTION negate_ra_sign_state -# CHECK-NEXT: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION negate_ra_sign_state # CHECK-NEXT: RET undef $lr # CHECK-LABEL: name: illegal1 # CHECK: body: | # CHECK-NEXT: bb.0 (%ir-block.0): # CHECK-NEXT: liveins: $lr -# CHECK: frame-setup CFI_INSTRUCTION negate_ra_sign_state -# CHECK-NEXT: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state # CHECK-NOT: BL @OUTLINED_FUNCTION_{{.*}} -# CHECK: frame-destroy CFI_INSTRUCTION negate_ra_sign_state -# CHECK-NEXT: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION negate_ra_sign_state # CHECK-NEXT: RET undef $lr # Outlined function that contains only legal sp modifications @@ -198,8 +198,8 @@ body: | # CHECK-NEXT: bb.0: # CHECK-NEXT: liveins: $lr # CHECK-NEXT: {{^ $}} -# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state # CHECK-NEXT: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 # CHECK: $sp = frame-destroy ADDXri $sp, 16, 0 # CHECK-NEXT: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll index cb43b3ba3e47e..a7ea32952f3b7 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll @@ -9,8 +9,8 @@ define void @a() #0 { ; CHECK-LABEL: a: // @a ; CHECK: // %bb.0: ; CHECK-NEXT: .cfi_b_key_frame -; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NOT: OUTLINED_FUNCTION_ %1 = alloca i32, align 4 %2 = alloca i32, align 4 @@ -33,8 +33,8 @@ define void @b() #0 { ; CHECK-LABEL: b: // @b ; CHECK: // %bb.0: ; CHECK-NEXT: .cfi_b_key_frame -; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NOT: OUTLINED_FUNCTION_ %1 = alloca i32, align 4 %2 = alloca i32, align 4 @@ -57,8 +57,8 @@ define void @c() #1 { ; CHECK-LABEL: c: // @c ; CHECK: // %bb.0: ; CHECK-NEXT: .cfi_b_key_frame -; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: hint #27 +; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NOT: OUTLINED_FUNCTION_ %1 = alloca i32, align 4 %2 = alloca i32, align 4 diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll index 0ba4455532925..da68ea5bf0dbc 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll +++ 
b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll @@ -10,8 +10,8 @@ declare i32 @thunk_called_fn(i32, i32, i32, i32) define i32 @a() #0 { ; V8A-LABEL: a: ; V8A: // %bb.0: // %entry -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #25 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V8A-NEXT: .cfi_def_cfa_offset 16 ; V8A-NEXT: .cfi_offset w30, -16 @@ -27,8 +27,8 @@ define i32 @a() #0 { ; ; V83A-LABEL: a: ; V83A: // %bb.0: // %entry -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 16 ; V83A-NEXT: .cfi_offset w30, -16 @@ -49,8 +49,8 @@ entry: define i32 @b() #0 { ; V8A-LABEL: b: ; V8A: // %bb.0: // %entry -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #25 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V8A-NEXT: .cfi_def_cfa_offset 16 ; V8A-NEXT: .cfi_offset w30, -16 @@ -66,8 +66,8 @@ define i32 @b() #0 { ; ; V83A-LABEL: b: ; V83A: // %bb.0: // %entry -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 16 ; V83A-NEXT: .cfi_offset w30, -16 @@ -88,8 +88,8 @@ entry: define hidden i32 @c(ptr %fptr) #0 { ; V8A-LABEL: c: ; V8A: // %bb.0: // %entry -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #25 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V8A-NEXT: .cfi_def_cfa_offset 16 ; V8A-NEXT: .cfi_offset w30, -16 @@ -106,8 +106,8 @@ define hidden i32 @c(ptr %fptr) #0 { ; ; V83A-LABEL: c: ; V83A: // %bb.0: // %entry -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 16 ; V83A-NEXT: .cfi_offset w30, -16 @@ -129,8 +129,8 @@ entry: define hidden i32 @d(ptr %fptr) #0 { ; V8A-LABEL: d: ; V8A: // %bb.0: // %entry -; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: hint #25 +; V8A-NEXT: .cfi_negate_ra_state ; V8A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V8A-NEXT: .cfi_def_cfa_offset 16 ; V8A-NEXT: .cfi_offset w30, -16 @@ -147,8 +147,8 @@ define hidden i32 @d(ptr %fptr) #0 { ; ; V83A-LABEL: d: ; V83A: // %bb.0: // %entry -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 16 ; V83A-NEXT: .cfi_offset w30, -16 @@ -176,5 +176,3 @@ attributes #0 = { "sign-return-address"="non-leaf" minsize } ; CHECK-NOT: .cfi_negate_ra_state ; CHECK-NOT: auti{{[a,b]}}sp ; CHECK-NOT: hint #{{[29,31]}} -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll b/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll index f823d2aa82ac0..373c4969a9405 100644 --- a/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll +++ b/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll @@ -34,8 +34,8 @@ entry: } ;; CHECK-LABEL: __llvm_gcov_writeout: ;; CHECK: .cfi_b_key_frame -;; CHECK-NEXT: .cfi_negate_ra_state ;; CHECK-NEXT: pacibsp +;; CHECK-NEXT: .cfi_negate_ra_state define internal void @__llvm_gcov_reset() unnamed_addr #2 { entry: @@ -54,9 +54,9 @@ entry: } ;; CHECK-LABEL: __llvm_gcov_init: ;; CHECK: .cfi_b_key_frame +;; CHECK-NEXT: pacibsp ;; CHECK-NEXT: .cfi_negate_ra_state ;; CHECK-NOT: .cfi_ -;; CHECK-NEXT: pacibsp ;; CHECK: .cfi_endproc attributes #0 = { norecurse nounwind readnone "sign-return-address"="all" "sign-return-address-key"="b_key" } diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll b/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll index 6ea072846d47c..4d4b7c215b978 100644 --- a/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll +++ b/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll @@ -9,8 +9,8 @@ define dso_local i32 @_Z3fooi(i32 %x) #0 { ; CHECK-V8A-LABEL: _Z3fooi: ; CHECK-V8A: // %bb.0: // %entry -; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: hint #25 +; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-V8A-NEXT: .cfi_def_cfa_offset 16 ; CHECK-V8A-NEXT: .cfi_offset w30, -16 @@ -27,8 +27,8 @@ define dso_local i32 @_Z3fooi(i32 %x) #0 { ; ; CHECK-V83A-LABEL: _Z3fooi: ; CHECK-V83A: // %bb.0: // %entry -; CHECK-V83A-NEXT: .cfi_negate_ra_state ; CHECK-V83A-NEXT: paciasp +; CHECK-V83A-NEXT: .cfi_negate_ra_state ; CHECK-V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-V83A-NEXT: .cfi_def_cfa_offset 16 ; CHECK-V83A-NEXT: .cfi_offset w30, -16 @@ -62,8 +62,8 @@ return: ; No predecessors! define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) { ; CHECK-V8A-LABEL: baz_async: ; CHECK-V8A: // %bb.0: // %entry -; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: hint #25 +; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-V8A-NEXT: .cfi_def_cfa_offset 16 ; CHECK-V8A-NEXT: .cfi_offset w30, -16 @@ -74,8 +74,8 @@ define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) { ; CHECK-V8A-NEXT: bl _Z3bari ; CHECK-V8A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-V8A-NEXT: .cfi_def_cfa_offset 0 -; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: hint #29 +; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: .cfi_restore w30 ; CHECK-V8A-NEXT: b _Z3bari ; CHECK-V8A-NEXT: .LBB1_2: // %if.else @@ -84,15 +84,15 @@ define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) { ; CHECK-V8A-NEXT: add w0, w0, #1 ; CHECK-V8A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-V8A-NEXT: .cfi_def_cfa_offset 0 -; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: hint #29 +; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: .cfi_restore w30 ; CHECK-V8A-NEXT: ret ; ; CHECK-V83A-LABEL: baz_async: ; CHECK-V83A: // %bb.0: // %entry -; CHECK-V83A-NEXT: .cfi_negate_ra_state ; CHECK-V83A-NEXT: paciasp +; CHECK-V83A-NEXT: .cfi_negate_ra_state ; CHECK-V83A-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-V83A-NEXT: .cfi_def_cfa_offset 16 ; CHECK-V83A-NEXT: .cfi_offset w30, -16 @@ -103,8 +103,8 @@ define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) { ; CHECK-V83A-NEXT: bl _Z3bari ; CHECK-V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-V83A-NEXT: .cfi_def_cfa_offset 0 -; CHECK-V83A-NEXT: .cfi_negate_ra_state ; CHECK-V83A-NEXT: autiasp +; CHECK-V83A-NEXT: .cfi_negate_ra_state ; CHECK-V83A-NEXT: .cfi_restore w30 ; CHECK-V83A-NEXT: b _Z3bari ; CHECK-V83A-NEXT: .LBB1_2: // %if.else @@ -143,8 +143,8 @@ return: ; preds = %if.else, %if.then define hidden noundef i32 @baz_sync(i32 noundef %a) #0 uwtable(sync) { ; CHECK-V8A-LABEL: baz_sync: ; CHECK-V8A: // %bb.0: // %entry -; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: hint #25 +; CHECK-V8A-NEXT: .cfi_negate_ra_state ; CHECK-V8A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-V8A-NEXT: .cfi_def_cfa_offset 16 ; CHECK-V8A-NEXT: .cfi_offset w30, -16 @@ -164,8 +164,8 @@ define hidden noundef i32 @baz_sync(i32 noundef %a) #0 uwtable(sync) { ; ; CHECK-V83A-LABEL: baz_sync: ; CHECK-V83A: // %bb.0: // %entry -; CHECK-V83A-NEXT: .cfi_negate_ra_state ; CHECK-V83A-NEXT: paciasp +; CHECK-V83A-NEXT: .cfi_negate_ra_state ; CHECK-V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-V83A-NEXT: .cfi_def_cfa_offset 16 ; CHECK-V83A-NEXT: .cfi_offset w30, -16 @@ -216,7 +216,7 @@ attributes #0 = { "sign-return-address"="all" } ; CHECK-DUMP-NOT: DW_CFA_remember_state ; CHECK-DUMP-NOT: DW_CFA_restore_state -; CHECK-DUMP: CFA=WSP +; CHECK-DUMP: CFA=WSP{{$}} ; CHECK-DUMP: reg34=1 ; CHECK-DUMP-NOT: reg34=0 @@ -229,7 +229,6 @@ attributes #0 = { "sign-return-address"="all" } ; CHECK-DUMP: DW_CFA_restore_state: ; CHECK-DUMP: DW_CFA_AARCH64_negate_ra_state: -; CHECK-DUMP: CFA=WSP ;; First DW_CFA_AARCH64_negate_ra_state: ; CHECK-DUMP: reg34=1 ;; Second DW_CFA_AARCH64_negate_ra_state: @@ -238,6 +237,7 @@ attributes #0 = { "sign-return-address"="all" } ; CHECK-DUMP: reg34=1 ;; Third DW_CFA_AARCH64_negate_ra_state: ; CHECK-DUMP: reg34=0 +; CHECK-DUMP-NOT: reg34=1 ; baz_sync ; CHECK-DUMP-LABEL: FDE @@ -246,6 +246,6 @@ attributes #0 = { "sign-return-address"="all" } ; CHECK-DUMP-NOT: DW_CFA_remember_state ; CHECK-DUMP-NOT: DW_CFA_restore_state -; CHECK-DUMP: CFA=WSP +; CHECK-DUMP: CFA=WSP{{$}} ; CHECK-DUMP: reg34=1 ; CHECK-DUMP-NOT: reg34=0 diff --git a/llvm/test/CodeGen/AArch64/sign-return-address.ll b/llvm/test/CodeGen/AArch64/sign-return-address.ll index e0ee0d84ab4f1..dafe0d71ceb5f 100644 --- a/llvm/test/CodeGen/AArch64/sign-return-address.ll +++ b/llvm/test/CodeGen/AArch64/sign-return-address.ll @@ -29,15 +29,15 @@ define i32 @leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" { ; COMPAT-LABEL: leaf_sign_all: ; COMPAT: // %bb.0: -; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #29 ; COMPAT-NEXT: ret ; ; V83A-LABEL: leaf_sign_all: ; V83A: // %bb.0: -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: retaa ret i32 %x } @@ -45,8 +45,8 @@ define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" { define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf" { ; COMPAT-LABEL: leaf_clobbers_lr: ; COMPAT: // %bb.0: -; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; COMPAT-NEXT: .cfi_def_cfa_offset 16 ; COMPAT-NEXT: .cfi_offset w30, -16 @@ -59,8 +59,8 @@ define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf" { ; ; V83A-LABEL: leaf_clobbers_lr: ; V83A: // %bb.0: -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 16 ; V83A-NEXT: .cfi_offset w30, -16 @@ -78,8 +78,8 @@ declare i32 @foo(i32) define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" { ; COMPAT-LABEL: non_leaf_sign_all: ; COMPAT: // %bb.0: -; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; COMPAT-NEXT: .cfi_def_cfa_offset 16 ; COMPAT-NEXT: .cfi_offset w30, -16 @@ -90,8 +90,8 @@ define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" { ; ; V83A-LABEL: non_leaf_sign_all: ; V83A: // %bb.0: -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 16 ; V83A-NEXT: .cfi_offset w30, -16 @@ -105,8 +105,8 @@ define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" { define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { ; COMPAT-LABEL: non_leaf_sign_non_leaf: ; COMPAT: // %bb.0: -; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; COMPAT-NEXT: .cfi_def_cfa_offset 16 ; COMPAT-NEXT: .cfi_offset w30, -16 @@ -117,8 +117,8 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { ; ; V83A-LABEL: non_leaf_sign_non_leaf: ; V83A: // %bb.0: -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 16 ; V83A-NEXT: .cfi_offset w30, -16 @@ -135,8 +135,8 @@ define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstac ; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [x18], #8 ; CHECK-NEXT: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 // -; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: paciasp +; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -152,8 +152,8 @@ define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstac define i32 @leaf_sign_all_v83(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" { ; CHECK-LABEL: leaf_sign_all_v83: ; CHECK: // %bb.0: -; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: paciasp +; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: retaa ret i32 %x } @@ -163,8 +163,8 @@ declare fastcc i64 @bar(i64) define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" { ; COMPAT-LABEL: spill_lr_and_tail_call: ; COMPAT: // %bb.0: -; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; COMPAT-NEXT: .cfi_def_cfa_offset 16 ; COMPAT-NEXT: .cfi_offset w30, -16 @@ -177,8 +177,8 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" { ; ; V83A-LABEL: spill_lr_and_tail_call: ; V83A: // %bb.0: -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; V83A-NEXT: .cfi_def_cfa_offset 16 ; V83A-NEXT: .cfi_offset w30, -16 @@ -196,15 +196,15 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" { define i32 @leaf_sign_all_a_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" { ; COMPAT-LABEL: leaf_sign_all_a_key: ; COMPAT: // %bb.0: -; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #29 ; COMPAT-NEXT: ret ; ; V83A-LABEL: leaf_sign_all_a_key: ; V83A: // %bb.0: -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: retaa ret i32 %x } @@ -213,16 +213,16 @@ define i32 @leaf_sign_all_b_key(i32 %x) "sign-return-address"="all" "sign-return ; COMPAT-LABEL: leaf_sign_all_b_key: ; COMPAT: // %bb.0: ; COMPAT-NEXT: .cfi_b_key_frame -; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #27 +; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #31 ; COMPAT-NEXT: ret ; ; V83A-LABEL: leaf_sign_all_b_key: ; V83A: // %bb.0: ; V83A-NEXT: .cfi_b_key_frame -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: retab ret i32 %x } @@ -231,8 +231,8 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target- ; CHECK-LABEL: leaf_sign_all_v83_b_key: ; CHECK: // %bb.0: ; CHECK-NEXT: .cfi_b_key_frame -; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: retab ret i32 %x } @@ -241,15 +241,15 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target- define i32 @leaf_sign_all_a_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" "branch-target-enforcement"{ ; COMPAT-LABEL: leaf_sign_all_a_key_bti: ; COMPAT: // %bb.0: -; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #29 ; COMPAT-NEXT: ret ; ; V83A-LABEL: leaf_sign_all_a_key_bti: ; V83A: // %bb.0: -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: retaa ret i32 %x } @@ -259,16 +259,16 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "sign-return-address"="all" "sign-re ; COMPAT-LABEL: leaf_sign_all_b_key_bti: ; COMPAT: // %bb.0: ; COMPAT-NEXT: .cfi_b_key_frame -; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #27 +; COMPAT-NEXT: .cfi_negate_ra_state ; COMPAT-NEXT: hint #31 ; COMPAT-NEXT: ret ; ; V83A-LABEL: leaf_sign_all_b_key_bti: ; V83A: // %bb.0: ; V83A-NEXT: .cfi_b_key_frame -; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state ; V83A-NEXT: retab ret i32 %x } @@ -278,8 +278,8 @@ define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "sign-return-address"="all" "tar ; CHECK-LABEL: leaf_sign_all_v83_b_key_bti: ; CHECK: // %bb.0: ; CHECK-NEXT: .cfi_b_key_frame -; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state ; CHECK-NEXT: retab ret i32 %x } diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 33d08beae2ca7..4a52bf27a7591 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -475,16 +475,12 @@ declare double @zt0_shared_callee(double) "aarch64_inout_zt0" define double @zt0_new_caller_to_zt0_shared_callee(double %x) nounwind noinline optnone "aarch64_new_zt0" { ; CHECK-COMMON-LABEL: 
zt0_new_caller_to_zt0_shared_callee: ; CHECK-COMMON: // %bb.0: // %prelude -; CHECK-COMMON-NEXT: sub sp, sp, #80 -; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB13_2 ; CHECK-COMMON-NEXT: b .LBB13_1 ; CHECK-COMMON-NEXT: .LBB13_1: // %save.za -; CHECK-COMMON-NEXT: mov x8, sp -; CHECK-COMMON-NEXT: str zt0, [x8] ; CHECK-COMMON-NEXT: bl __arm_tpidr2_save -; CHECK-COMMON-NEXT: ldr zt0, [x8] ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr ; CHECK-COMMON-NEXT: b .LBB13_2 ; CHECK-COMMON-NEXT: .LBB13_2: // %entry @@ -495,8 +491,7 @@ define double @zt0_new_caller_to_zt0_shared_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: fmov d1, x8 ; CHECK-COMMON-NEXT: fadd d0, d0, d1 ; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-COMMON-NEXT: add sp, sp, #80 +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ret entry: %call = call double @zt0_shared_callee(double %x) diff --git a/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll b/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll new file mode 100644 index 0000000000000..69f603458670c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -O0 < %s | FileCheck %s + +; Example of locally streaming function that (at -O0) must preserve the LR (X30) +; before calling __arm_get_current_vg. +define void @foo() "aarch64_pstate_sm_body" { +; CHECK-LABEL: foo: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: str x9, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-new-zt0-function.ll b/llvm/test/CodeGen/AArch64/sme-new-zt0-function.ll new file mode 100644 index 0000000000000..94968ab4fd9ac --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-new-zt0-function.ll @@ -0,0 +1,14 @@ +; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi %s | FileCheck %s + +declare void @callee(); + +define void @private_za() "aarch64_new_zt0" { + call void @callee() + ret void +} + +; CHECK: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() #[[TPIDR2_SAVE_CALL_ATTR:[0-9]+]] +; CHECK: declare void @__arm_tpidr2_save() #[[TPIDR2_SAVE_DECL_ATTR:[0-9]+]] + +; CHECK: attributes #[[TPIDR2_SAVE_DECL_ATTR]] = { "aarch64_pstate_sm_compatible" } +; CHECK: attributes #[[TPIDR2_SAVE_CALL_ATTR]] = { "aarch64_zt0_undef" } diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 312537630e77a..7361e850d713e 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -112,7 +112,7 @@ define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aar ret void; } -; New-ZA Callee +; New-ZT0 Callee ; Expect spill & fill of ZT0 around call ; Expect smstop/smstart za around call @@ -134,6 +134,72 @@ define void @zt0_in_caller_zt0_new_callee() "aarch64_in_zt0" nounwind { ret void; } +; New-ZT0 Callee + +; Expect commit of lazy-save if ZA is dormant +; Expect smstart ZA & clear ZT0 +; Expect spill & fill of ZT0 around call +; Before return, expect smstop ZA +define void @zt0_new_caller_zt0_new_callee() "aarch64_new_zt0" nounwind { +; CHECK-LABEL: zt0_new_caller_zt0_new_callee: +; CHECK: // %bb.0: // %prelude +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbz x8, .LBB6_2 +; CHECK-NEXT: // %bb.1: // %save.za +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, 
xzr +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: smstart za +; CHECK-NEXT: zero { zt0 } +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret + call void @callee() "aarch64_new_zt0"; + ret void; +} + +; Expect commit of lazy-save if ZA is dormant +; Expect smstart ZA & clear ZT0 +; No spill & fill of ZT0 around __arm_tpidr2_save +; Expect spill & fill of ZT0 around __arm_sme_state call +; Before return, expect smstop ZA +define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { +; CHECK-LABEL: zt0_new_caller_abi_routine_callee: +; CHECK: // %bb.0: // %prelude +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbz x8, .LBB7_2 +; CHECK-NEXT: // %bb.1: // %save.za +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: smstart za +; CHECK-NEXT: zero { zt0 } +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret + %res = call {i64, i64} @__arm_sme_state() + %res.0 = extractvalue {i64, i64} %res, 0 + ret i64 %res.0 +} + +declare {i64, i64} @__arm_sme_state() + ; ; New-ZA Caller ; @@ -144,23 +210,18 @@ define void @zt0_in_caller_zt0_new_callee() "aarch64_in_zt0" nounwind { define void @zt0_new_caller() "aarch64_new_zt0" nounwind { ; CHECK-LABEL: zt0_new_caller: ; CHECK: // %bb.0: // %prelude -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbz x8, .LBB6_2 +; CHECK-NEXT: cbz x8, .LBB8_2 ; CHECK-NEXT: // %bb.1: // %save.za -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str zt0, [x8] ; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: ldr zt0, [x8] ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: .LBB8_2: ; CHECK-NEXT: smstart za ; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstop za -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @callee() "aarch64_in_zt0"; ret void; @@ -172,24 +233,19 @@ define void @zt0_new_caller() "aarch64_new_zt0" nounwind { define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-LABEL: new_za_zt0_caller: ; CHECK: // %bb.0: // %prelude -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbz x8, .LBB7_2 +; CHECK-NEXT: cbz x8, .LBB9_2 ; CHECK-NEXT: // %bb.1: // %save.za -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str zt0, [x8] ; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: ldr zt0, [x8] ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: .LBB9_2: ; CHECK-NEXT: smstart za ; CHECK-NEXT: zero {za} ; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstop za -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index a4c2b30566a95..fcedcb8e24222 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -2911,12 +2911,13 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK64-NEXT: mov x9, sp ; CHECK64-NEXT: mov w20, w0 ; CHECK64-NEXT: msub x9, x8, x8, x9 +; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: mov sp, x9 -; CHECK64-NEXT: stur x9, [x29, #-208] -; CHECK64-NEXT: sub x9, x29, #208 -; CHECK64-NEXT: sturh wzr, [x29, #-198] -; CHECK64-NEXT: stur wzr, [x29, #-196] -; CHECK64-NEXT: sturh w8, [x29, #-200] +; CHECK64-NEXT: str x9, [x19] +; CHECK64-NEXT: add x9, x19, #0 +; CHECK64-NEXT: strh wzr, [x19, #10] +; CHECK64-NEXT: str wzr, [x19, #12] +; CHECK64-NEXT: strh w8, [x19, #8] ; CHECK64-NEXT: msr TPIDR2_EL0, x9 ; CHECK64-NEXT: .cfi_offset vg, -32 ; CHECK64-NEXT: smstop sm @@ -2925,7 +2926,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK64-NEXT: .cfi_restore vg ; CHECK64-NEXT: smstart za ; CHECK64-NEXT: mrs x8, TPIDR2_EL0 -; CHECK64-NEXT: sub x0, x29, #208 +; CHECK64-NEXT: add x0, x19, #0 ; CHECK64-NEXT: cbnz x8, .LBB33_2 ; CHECK64-NEXT: // %bb.1: // %entry ; CHECK64-NEXT: bl __arm_tpidr2_restore @@ -2991,16 +2992,13 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK1024-NEXT: mov x9, sp ; CHECK1024-NEXT: mov w20, w0 ; CHECK1024-NEXT: msub x9, x8, x8, x9 +; CHECK1024-NEXT: mov x19, sp ; CHECK1024-NEXT: mov sp, x9 -; CHECK1024-NEXT: sub x10, x29, #1872 -; CHECK1024-NEXT: stur x9, [x10, #-256] -; CHECK1024-NEXT: sub x9, x29, #1862 -; CHECK1024-NEXT: sub x10, x29, #1860 -; CHECK1024-NEXT: sturh wzr, [x9, #-256] -; CHECK1024-NEXT: sub x9, x29, #2128 -; CHECK1024-NEXT: stur wzr, [x10, #-256] -; CHECK1024-NEXT: sub x10, x29, #1864 -; CHECK1024-NEXT: sturh w8, [x10, #-256] +; CHECK1024-NEXT: str x9, [x19] +; CHECK1024-NEXT: add x9, x19, #0 +; CHECK1024-NEXT: strh wzr, [x19, #10] +; CHECK1024-NEXT: str wzr, [x19, #12] +; CHECK1024-NEXT: strh w8, [x19, #8] ; CHECK1024-NEXT: msr TPIDR2_EL0, x9 ; CHECK1024-NEXT: .cfi_offset vg, -32 ; CHECK1024-NEXT: smstop sm @@ -3009,7 +3007,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK1024-NEXT: .cfi_restore vg ; CHECK1024-NEXT: smstart za ; CHECK1024-NEXT: mrs x8, TPIDR2_EL0 -; CHECK1024-NEXT: sub x0, x29, #2128 +; CHECK1024-NEXT: add x0, x19, #0 ; CHECK1024-NEXT: cbnz x8, .LBB33_2 ; CHECK1024-NEXT: // %bb.1: // %entry ; CHECK1024-NEXT: bl __arm_tpidr2_restore @@ -3154,3 +3152,1176 @@ entry: call void @bar(ptr noundef nonnull %b) ret i32 0 } + + +define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; 
CHECK0-LABEL: svecc_call_dynamic_alloca: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 64 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: mov x29, sp +; CHECK0-NEXT: .cfi_def_cfa w29, 64 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w26, -24 +; CHECK0-NEXT: .cfi_offset w27, -32 +; CHECK0-NEXT: .cfi_offset w28, -40 +; CHECK0-NEXT: .cfi_offset w30, -56 +; CHECK0-NEXT: .cfi_offset w29, -64 +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 64 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 64 - 16 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 64 - 24 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 64 - 32 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 64 - 40 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 64 - 48 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG +; CHECK0-NEXT: .cfi_escape 
0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG +; CHECK0-NEXT: mov w9, w0 +; CHECK0-NEXT: mov x8, sp +; CHECK0-NEXT: mov w2, w1 +; CHECK0-NEXT: add x9, x9, #15 +; CHECK0-NEXT: mov x19, sp +; CHECK0-NEXT: and x9, x9, #0x1fffffff0 +; CHECK0-NEXT: sub x8, x8, x9 +; CHECK0-NEXT: mov sp, x8 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: and x20, x0, #0x1 +; CHECK0-NEXT: .cfi_offset vg, -48 +; CHECK0-NEXT: tbz w20, #0, .LBB35_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB35_2: // %entry +; CHECK0-NEXT: mov x0, x8 +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w20, #0, .LBB35_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB35_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: addvl sp, x29, #-18 +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: mov sp, x29 +; CHECK0-NEXT: .cfi_def_cfa wsp, 64 +; CHECK0-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: 
.cfi_restore w20 +; CHECK0-NEXT: .cfi_restore w26 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_call_dynamic_alloca: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #128 +; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: .cfi_def_cfa w29, 64 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w26, -24 +; CHECK64-NEXT: .cfi_offset w27, -32 +; CHECK64-NEXT: .cfi_offset w28, -40 +; CHECK64-NEXT: .cfi_offset w30, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 128 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 128 - 16 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 128 - 24 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 128 - 32 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 
0x1e, 0x22 // $d12 @ cfa - 128 - 40 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 128 - 48 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 128 - 56 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: mov w9, w0 +; CHECK64-NEXT: mov x8, sp +; CHECK64-NEXT: mov w2, w1 +; CHECK64-NEXT: add x9, x9, #15 +; CHECK64-NEXT: mov x19, sp +; CHECK64-NEXT: and x9, x9, #0x1fffffff0 +; CHECK64-NEXT: sub x8, x8, x9 +; CHECK64-NEXT: mov sp, x8 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x20, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -48 +; CHECK64-NEXT: tbz w20, #0, .LBB35_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB35_2: // %entry +; CHECK64-NEXT: mov x0, x8 +; CHECK64-NEXT: mov w1, #45 // =0x2d +; CHECK64-NEXT: bl memset +; CHECK64-NEXT: tbz w20, #0, .LBB35_4 +; CHECK64-NEXT: // %bb.3: // %entry +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB35_4: // %entry +; CHECK64-NEXT: mov w0, #22647 // =0x5877 +; CHECK64-NEXT: movk w0, #59491, lsl #16 +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: sub x8, x29, #64 +; CHECK64-NEXT: addvl sp, x8, #-18 +; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: 
.cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 +; CHECK64-NEXT: sub sp, x29, #64 +; CHECK64-NEXT: .cfi_def_cfa wsp, 128 +; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #96] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w20 +; CHECK64-NEXT: .cfi_restore w26 +; CHECK64-NEXT: .cfi_restore w27 +; CHECK64-NEXT: .cfi_restore w28 +; CHECK64-NEXT: .cfi_restore w30 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_call_dynamic_alloca: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill +; CHECK1024-NEXT: add x29, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w20, -16 +; CHECK1024-NEXT: .cfi_offset w26, -24 +; CHECK1024-NEXT: .cfi_offset w27, -32 +; CHECK1024-NEXT: .cfi_offset w28, -40 +; CHECK1024-NEXT: .cfi_offset w30, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: addvl sp, sp, #-18 +; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded 
Spill +; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1088 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1088 - 16 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1088 - 24 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1088 - 32 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1088 - 40 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1088 - 48 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1088 - 56 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: mov w9, w0 +; CHECK1024-NEXT: mov x8, sp +; CHECK1024-NEXT: mov w2, w1 +; CHECK1024-NEXT: add x9, x9, #15 +; CHECK1024-NEXT: mov x19, sp +; CHECK1024-NEXT: and x9, x9, #0x1fffffff0 +; CHECK1024-NEXT: sub x8, x8, x9 +; CHECK1024-NEXT: mov sp, x8 +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x20, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -48 +; CHECK1024-NEXT: tbz w20, #0, .LBB35_2 +; CHECK1024-NEXT: // %bb.1: // %entry +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB35_2: // %entry +; CHECK1024-NEXT: mov x0, x8 +; CHECK1024-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NEXT: bl memset +; CHECK1024-NEXT: tbz w20, #0, .LBB35_4 +; CHECK1024-NEXT: // %bb.3: // %entry +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .LBB35_4: // %entry +; CHECK1024-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: sub x8, x29, #1024 +; CHECK1024-NEXT: addvl sp, x8, #-18 +; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded 
Reload +; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: .cfi_restore z8 +; CHECK1024-NEXT: .cfi_restore z9 +; CHECK1024-NEXT: .cfi_restore z10 +; CHECK1024-NEXT: .cfi_restore z11 +; CHECK1024-NEXT: .cfi_restore z12 +; CHECK1024-NEXT: .cfi_restore z13 +; CHECK1024-NEXT: .cfi_restore z14 +; CHECK1024-NEXT: .cfi_restore z15 +; CHECK1024-NEXT: sub sp, x29, #1024 +; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w20 +; CHECK1024-NEXT: .cfi_restore w26 +; CHECK1024-NEXT: .cfi_restore w27 +; CHECK1024-NEXT: .cfi_restore w28 +; CHECK1024-NEXT: .cfi_restore w30 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: ret +entry: + %ptr = alloca i8, i32 %P1 + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) + ret i32 -396142473 +} + + +define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_call_realign: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 64 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: mov x29, sp +; CHECK0-NEXT: .cfi_def_cfa w29, 64 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w26, -16 +; CHECK0-NEXT: .cfi_offset w27, -24 +; CHECK0-NEXT: .cfi_offset w28, -32 +; CHECK0-NEXT: .cfi_offset w30, -56 +; CHECK0-NEXT: .cfi_offset w29, -64 +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 64 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 64 - 16 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 64 - 24 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 64 - 32 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 64 - 40 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 64 - 48 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG +; CHECK0-NEXT: sub x9, sp, #1024 +; CHECK0-NEXT: 
and sp, x9, #0xffffffffffffffe0 +; CHECK0-NEXT: mov w2, w1 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: and x19, x0, #0x1 +; CHECK0-NEXT: .cfi_offset vg, -48 +; CHECK0-NEXT: tbz w19, #0, .LBB36_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB36_2: // %entry +; CHECK0-NEXT: mov x0, sp +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w19, #0, .LBB36_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB36_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: addvl sp, x29, #-18 +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: mov sp, x29 +; CHECK0-NEXT: .cfi_def_cfa wsp, 64 +; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w26 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_call_realign: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #128 +; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: cntd x9 +; 
CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill +; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: .cfi_def_cfa w29, 64 +; CHECK64-NEXT: .cfi_offset w19, -16 +; CHECK64-NEXT: .cfi_offset w26, -24 +; CHECK64-NEXT: .cfi_offset w27, -32 +; CHECK64-NEXT: .cfi_offset w28, -40 +; CHECK64-NEXT: .cfi_offset w30, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 128 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 128 - 16 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 128 - 24 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 128 - 32 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 128 - 40 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 128 - 48 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 128 - 56 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 
0x22 // $d15 @ cfa - 128 - 64 * VG +; CHECK64-NEXT: sub x9, sp, #1088 +; CHECK64-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK64-NEXT: mov w2, w1 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x19, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -48 +; CHECK64-NEXT: tbz w19, #0, .LBB36_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB36_2: // %entry +; CHECK64-NEXT: mov x0, sp +; CHECK64-NEXT: mov w1, #45 // =0x2d +; CHECK64-NEXT: bl memset +; CHECK64-NEXT: tbz w19, #0, .LBB36_4 +; CHECK64-NEXT: // %bb.3: // %entry +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB36_4: // %entry +; CHECK64-NEXT: mov w0, #22647 // =0x5877 +; CHECK64-NEXT: movk w0, #59491, lsl #16 +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: sub x8, x29, #64 +; CHECK64-NEXT: addvl sp, x8, #-18 +; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 +; CHECK64-NEXT: sub sp, x29, #64 +; CHECK64-NEXT: .cfi_def_cfa wsp, 128 +; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w26 +; CHECK64-NEXT: .cfi_restore w27 +; CHECK64-NEXT: .cfi_restore w28 +; CHECK64-NEXT: .cfi_restore w30 +; 
CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_call_realign: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NEXT: add x29, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NEXT: .cfi_offset w19, -16 +; CHECK1024-NEXT: .cfi_offset w26, -24 +; CHECK1024-NEXT: .cfi_offset w27, -32 +; CHECK1024-NEXT: .cfi_offset w28, -40 +; CHECK1024-NEXT: .cfi_offset w30, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: addvl sp, sp, #-18 +; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1088 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1088 - 16 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1088 - 24 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1088 - 32 * VG +; CHECK1024-NEXT: 
.cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1088 - 40 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1088 - 48 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1088 - 56 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG +; CHECK1024-NEXT: sub x9, sp, #2048 +; CHECK1024-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK1024-NEXT: mov w2, w1 +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x19, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -48 +; CHECK1024-NEXT: tbz w19, #0, .LBB36_2 +; CHECK1024-NEXT: // %bb.1: // %entry +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB36_2: // %entry +; CHECK1024-NEXT: mov x0, sp +; CHECK1024-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NEXT: bl memset +; CHECK1024-NEXT: tbz w19, #0, .LBB36_4 +; CHECK1024-NEXT: // %bb.3: // %entry +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .LBB36_4: // %entry +; CHECK1024-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: sub x8, x29, #1024 +; CHECK1024-NEXT: addvl sp, x8, #-18 +; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: .cfi_restore z8 +; CHECK1024-NEXT: .cfi_restore z9 +; CHECK1024-NEXT: .cfi_restore z10 +; CHECK1024-NEXT: .cfi_restore z11 +; 
CHECK1024-NEXT: .cfi_restore z12 +; CHECK1024-NEXT: .cfi_restore z13 +; CHECK1024-NEXT: .cfi_restore z14 +; CHECK1024-NEXT: .cfi_restore z15 +; CHECK1024-NEXT: sub sp, x29, #1024 +; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w26 +; CHECK1024-NEXT: .cfi_restore w27 +; CHECK1024-NEXT: .cfi_restore w28 +; CHECK1024-NEXT: .cfi_restore w30 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: ret +entry: + %ptr = alloca i8, i32 1000, align 32 + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) + ret i32 -396142473 +} + + +define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK0-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK0-NEXT: mov x29, sp +; CHECK0-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: sub sp, 
sp, #48 +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: mov x19, sp +; CHECK0-NEXT: .cfi_def_cfa w29, 64 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w26, -24 +; CHECK0-NEXT: .cfi_offset w27, -32 +; CHECK0-NEXT: .cfi_offset w28, -48 +; CHECK0-NEXT: .cfi_offset w30, -56 +; CHECK0-NEXT: .cfi_offset w29, -64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 64 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 64 - 16 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 64 - 24 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 64 - 32 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 64 - 40 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 64 - 48 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG +; CHECK0-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK0-NEXT: ubfiz x8, x0, #2, #32 +; CHECK0-NEXT: mov x9, sp +; CHECK0-NEXT: add x8, x8, #15 +; CHECK0-NEXT: and x8, x8, #0x7fffffff0 +; CHECK0-NEXT: sub x20, x9, x8 +; CHECK0-NEXT: mov sp, x20 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: add x0, x19, #8 +; CHECK0-NEXT: bl bar +; CHECK0-NEXT: addvl x0, x29, #-19 +; CHECK0-NEXT: bl bar +; CHECK0-NEXT: mov x0, x20 +; CHECK0-NEXT: bl bar +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: addvl sp, x29, #-18 +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload 
+; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: mov sp, x29 +; CHECK0-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #128 +; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: stp x26, x20, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #112 +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: mov x19, sp +; CHECK64-NEXT: .cfi_def_cfa w29, 64 +; CHECK64-NEXT: .cfi_offset w19, -16 +; CHECK64-NEXT: .cfi_offset w20, -24 +; CHECK64-NEXT: .cfi_offset w26, -32 +; CHECK64-NEXT: .cfi_offset w27, -40 +; CHECK64-NEXT: .cfi_offset w28, -48 +; CHECK64-NEXT: .cfi_offset w30, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 128 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x80, 
0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 128 - 16 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 128 - 24 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 128 - 32 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 128 - 40 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 128 - 48 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 128 - 56 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG +; CHECK64-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK64-NEXT: ubfiz x8, x0, #2, #32 +; CHECK64-NEXT: mov x9, sp +; CHECK64-NEXT: add x8, x8, #15 +; CHECK64-NEXT: and x8, x8, #0x7fffffff0 +; CHECK64-NEXT: sub x20, x9, x8 +; CHECK64-NEXT: mov sp, x20 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: add x0, x19, #8 +; CHECK64-NEXT: bl bar +; CHECK64-NEXT: sub x0, x29, #64 +; CHECK64-NEXT: addvl x0, x0, #-19 +; CHECK64-NEXT: bl bar +; CHECK64-NEXT: mov x0, x20 +; CHECK64-NEXT: bl bar +; CHECK64-NEXT: mov w0, #22647 // =0x5877 +; CHECK64-NEXT: sub x8, x29, #64 +; CHECK64-NEXT: movk w0, #59491, lsl #16 +; CHECK64-NEXT: addvl sp, x8, #-18 +; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: sub sp, x29, #64 +; CHECK64-NEXT: ldp 
x20, x19, [sp, #104] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #88] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x30, x28, [sp, #72] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1088 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: add x29, sp, #1024 +; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-18 +; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1072 +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: mov x19, sp +; CHECK1024-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NEXT: .cfi_offset w19, -16 +; CHECK1024-NEXT: .cfi_offset w20, -24 +; CHECK1024-NEXT: .cfi_offset w26, -32 +; CHECK1024-NEXT: .cfi_offset w27, -40 +; CHECK1024-NEXT: .cfi_offset w28, -48 +; CHECK1024-NEXT: .cfi_offset w30, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1088 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // 
$d9 @ cfa - 1088 - 16 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1088 - 24 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1088 - 32 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1088 - 40 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1088 - 48 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1088 - 56 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG +; CHECK1024-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-NEXT: mov x9, sp +; CHECK1024-NEXT: add x8, x8, #15 +; CHECK1024-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-NEXT: sub x20, x9, x8 +; CHECK1024-NEXT: mov sp, x20 +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add x0, x19, #8 +; CHECK1024-NEXT: bl bar +; CHECK1024-NEXT: sub x0, x29, #1024 +; CHECK1024-NEXT: addvl x0, x0, #-19 +; CHECK1024-NEXT: bl bar +; CHECK1024-NEXT: mov x0, x20 +; CHECK1024-NEXT: bl bar +; CHECK1024-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NEXT: sub x8, x29, #1024 +; CHECK1024-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NEXT: addvl sp, x8, #-18 +; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded 
Reload +; CHECK1024-NEXT: sub sp, x29, #1024 +; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1088 +; CHECK1024-NEXT: ret +entry: + %a = alloca i32, i32 10 + %b = alloca + %c = alloca i32, i32 %P1, align 4 + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + call void @bar(ptr noundef nonnull %a) + call void @bar(ptr noundef nonnull %b) + call void @bar(ptr noundef nonnull %c) + ret i32 -396142473 +} diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll index d050eaf6646de..5666ab35cde48 100644 --- a/llvm/test/CodeGen/AArch64/xar.ll +++ b/llvm/test/CodeGen/AArch64/xar.ll @@ -19,4 +19,24 @@ define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) { ret <2 x i64> %b } +define <1 x i64> @xar_v1i64(<1 x i64> %a, <1 x i64> %b) { +; SHA3-LABEL: xar_v1i64: +; SHA3: // %bb.0: +; SHA3-NEXT: // kill: def $d0 killed $d0 def $q0 +; SHA3-NEXT: // kill: def $d1 killed $d1 def $q1 +; SHA3-NEXT: xar v0.2d, v0.2d, v1.2d, #63 +; SHA3-NEXT: // kill: def $d0 killed $d0 killed $q0 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_v1i64: +; NOSHA3: // %bb.0: +; NOSHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; NOSHA3-NEXT: shl d0, d1, #1 +; NOSHA3-NEXT: usra d0, d1, #63 +; NOSHA3-NEXT: ret + %v.val = xor <1 x i64> %a, %b + %fshl = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %v.val, <1 x i64> %v.val, <1 x i64> splat (i64 1)) + ret <1 x i64> %fshl +} + declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) diff --git a/llvm/test/CodeGen/Hexagon/isel/pfalse-v4i1.ll b/llvm/test/CodeGen/Hexagon/isel/pfalse-v4i1.ll new file mode 100644 index 0000000000000..2c26bb1e310ea --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel/pfalse-v4i1.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=hexagon -debug-only=isel 2>&1 < %s - | FileCheck %s +; REQUIRES: asserts + +; CHECK: [[R0:%[0-9]+]]:intregs = A2_tfrsi 0 +; CHECK-NEXT: predregs = C2_tfrrp killed [[R0]]:intregs + +define fastcc i16 @test(ptr %0, { <4 x i32>, <4 x i1> } %1, <4 x i1> %2) { +Entry: + %3 = alloca [16 x i8], i32 0, align 16 + %4 = alloca [16 x i8], i32 0, align 16 + store <4 x i32> , ptr %4, align 16 + store <4 x i32> , ptr %3, align 16 + %5 = load <4 x i32>, ptr %4, align 16 + %6 = load <4 x i32>, ptr %3, align 16 + %7 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> %5, <4 x i32> %6) + %8 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %2) + br i1 %8, label %OverflowFail, label %OverflowOk + +OverflowFail: ; preds = %Entry + store volatile i32 0, ptr null, align 4 + unreachable + +OverflowOk: ; preds = %Entry + %9 = extractvalue { <4 x i32>, <4 x i1> } %7, 0 + store <4 x i32> %9, ptr %0, align 16 + ret i16 0 + } + +declare { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) #0 +declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>) #0 diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll index 24bd4c75a9821..73d0bda895de0 100644 --- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll @@ -34,15 +34,7 @@ ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors 
; CHECK-NEXT: Module Verifier -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Natural Loop Information -; CHECK-NEXT: Post-Dominator Tree Construction -; CHECK-NEXT: Branch Probability Analysis ; CHECK-NEXT: Assignment Tracking Analysis -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: LoongArch DAG->DAG Pattern Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation diff --git a/llvm/test/CodeGen/LoongArch/csrxchg-intrinsic.ll b/llvm/test/CodeGen/LoongArch/csrxchg-intrinsic.ll new file mode 100644 index 0000000000000..2f38b3a8c7ad1 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/csrxchg-intrinsic.ll @@ -0,0 +1,24 @@ +; RUN: llc --mtriple=loongarch32 --mattr=+f --verify-machineinstrs < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+f --verify-machineinstrs < %s | FileCheck %s + +declare i32 @llvm.loongarch.csrxchg.w(i32, i32, i32 immarg) + +;; Check that the rj operand of csrxchg is not R0. +define void @csrxchg_w_rj_not_r0(i32 signext %a) { +; CHECK-NOT: csrxchg ${{[a-z]*}}, $r0, 0 +; CHECK-NOT: csrxchg ${{[a-z]*}}, $zero, 0 +entry: + %0 = tail call i32 @llvm.loongarch.csrxchg.w(i32 %a, i32 0, i32 0) + ret void +} + +;; Check that the rj operand of csrxchg is not R1. +define i32 @csrxchg_w_rj_not_r1() { +; CHECK-NOT: csrxchg ${{[a-z]*}}, $r1, 0 +; CHECK-NOT: csrxchg ${{[a-z]*}}, $ra, 0 +entry: + %0 = tail call i32 asm "", "=r,r,i,{r4},{r5},{r6},{r7},{r8},{r9},{r10},{r11},{r12},{r13},{r14},{r15},{r16},{r17},{r18},{r19},{r20},{r23},{r24},{r25},{r26},{r27},{r28},{r29},{r30},{r31},0"(i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %1 = tail call i32 @llvm.loongarch.csrxchg.w(i32 %0, i32 4, i32 0) + %2 = tail call i32 asm "", "=r,r,i,{r4},{r5},{r6},{r7},{r8},{r9},{r10},{r11},{r12},{r13},{r14},{r15},{r16},{r17},{r18},{r19},{r20},{r23},{r24},{r25},{r26},{r27},{r28},{r29},{r30},{r31},0"(i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 %1) + ret i32 %2 +} diff --git a/llvm/test/CodeGen/LoongArch/isel-optnone.ll b/llvm/test/CodeGen/LoongArch/isel-optnone.ll new file mode 100644 index 0000000000000..4d2528a3148ac --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/isel-optnone.ll @@ -0,0 +1,10 @@ +; REQUIRES: asserts +; RUN: llc %s -O0 -mtriple=loongarch64 -o /dev/null -debug-only=isel 2>&1 | FileCheck %s + +define void @fooOptnone() #0 { +; CHECK-NOT: Changing optimization level for Function fooOptnone +; CHECK-NOT: Restoring optimization level for Function fooOptnone + ret void +} + +attributes #0 = { nounwind optnone noinline } diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll new file mode 100644 index 0000000000000..f3bec11810e9b --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s + +;; Fix https://github.com/llvm/llvm-project/issues/137000. 
+ +define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: shufflevector_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; CHECK-NEXT: movgr2fr.d $fa2, $a0 +; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2 +; CHECK-NEXT: movgr2fr.d $fa3, $a0 +; CHECK-NEXT: movfr2gr.d $a0, $fa2 +; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 0 +; CHECK-NEXT: movfr2gr.d $a0, $fa3 +; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 1 +; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; CHECK-NEXT: movgr2fr.d $fa0, $a0 +; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3 +; CHECK-NEXT: movgr2fr.d $fa1, $a0 +; CHECK-NEXT: movfr2gr.d $a0, $fa0 +; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 2 +; CHECK-NEXT: movfr2gr.d $a0, $fa1 +; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 3 +; CHECK-NEXT: xvori.b $xr0, $xr2, 0 +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 6, i32 3, i32 7> + ret <4 x double> %c +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll index 0d9f57b57ffae..ed333c303879c 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll @@ -31,9 +31,9 @@ define void @fptosi_v4f64_v4i32(ptr %res, ptr %in){ ; CHECK-LABEL: fptosi_v4f64_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvftintrz.l.d $xr0, $xr0 ; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 -; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvftintrz.w.s $xr0, $xr0 +; CHECK-NEXT: xvpickev.w $xr0, $xr1, $xr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret %v0 = load <4 x double>, ptr %in diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll index 27d70f33cd34e..9c499ba71d646 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll @@ -31,9 +31,9 @@ define void @fptoui_v4f64_v4i32(ptr %res, ptr %in){ ; CHECK-LABEL: fptoui_v4f64_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvftintrz.lu.d $xr0, $xr0 ; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 -; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvftintrz.w.s $xr0, $xr0 +; CHECK-NEXT: xvpickev.w $xr0, $xr1, $xr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret %v0 = load <4 x double>, ptr %in diff --git a/llvm/test/CodeGen/LoongArch/spill-ra-without-kill.ll b/llvm/test/CodeGen/LoongArch/spill-ra-without-kill.ll index 08534e307e4e0..c1b1c1f7568bb 100644 --- a/llvm/test/CodeGen/LoongArch/spill-ra-without-kill.ll +++ b/llvm/test/CodeGen/LoongArch/spill-ra-without-kill.ll @@ -39,6 +39,7 @@ define dso_local ptr @f(i32 noundef signext %i) "frame-pointer"="all" { ; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_3: # %if.end ; CHECK-NEXT: ld.d $a0, $fp, -48 # 8-byte Folded Reload +; CHECK-NEXT: addi.w $a0, $a0, 0 ; CHECK-NEXT: ori $a1, $zero, 1 ; CHECK-NEXT: bne $a0, $a1, .LBB0_6 ; CHECK-NEXT: b .LBB0_4 diff --git a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir index b2abff75880c9..1030917c87419 100644 --- a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir +++ b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir @@ -25,8 +25,8 @@ alignment: 4 tracksRegLiveness: true frameInfo: maxCallFrameSize: 0 -#CHECK: frame-setup CFI_INSTRUCTION negate_ra_sign_state #CHECK: frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +#CHECK: frame-setup 
CFI_INSTRUCTION negate_ra_sign_state #CHECK: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp body: | bb.0.entry: @@ -42,8 +42,8 @@ tracksRegLiveness: true frameInfo: maxCallFrameSize: 0 #CHECK: frame-setup EMITBKEY -#CHECK: frame-setup CFI_INSTRUCTION negate_ra_sign_state #CHECK: frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp +#CHECK: frame-setup CFI_INSTRUCTION negate_ra_sign_state #CHECK: frame-destroy AUTIBSP implicit-def $lr, implicit $lr, implicit $sp body: | bb.0.entry: diff --git a/llvm/test/CodeGen/RISCV/pr135206.ll b/llvm/test/CodeGen/RISCV/pr135206.ll new file mode 100644 index 0000000000000..859179f62d704 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/pr135206.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple riscv64 < %s -o - | FileCheck %s + +%"buff" = type { [4096 x i64] } + +declare void @llvm.memset.p0.i64(ptr, i8, i64, i1) +declare void @bar() + +define i1 @foo() nounwind "probe-stack"="inline-asm" "target-features"="+v" { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -2032 +; CHECK-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 2008(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 2000(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 1992(sp) # 8-byte Folded Spill +; CHECK-NEXT: lui a0, 7 +; CHECK-NEXT: sub t1, sp, a0 +; CHECK-NEXT: lui t2, 1 +; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, t2 +; CHECK-NEXT: sd zero, 0(sp) +; CHECK-NEXT: bne sp, t1, .LBB0_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: addi sp, sp, -2048 +; CHECK-NEXT: addi sp, sp, -96 +; CHECK-NEXT: csrr t1, vlenb +; CHECK-NEXT: lui t2, 1 +; CHECK-NEXT: .LBB0_3: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, t2 +; CHECK-NEXT: sd zero, 0(sp) +; CHECK-NEXT: sub t1, t1, t2 +; CHECK-NEXT: bge t1, t2, .LBB0_3 +; CHECK-NEXT: # %bb.4: +; CHECK-NEXT: sub sp, sp, t1 +; CHECK-NEXT: li a0, 86 +; CHECK-NEXT: addi s0, sp, 48 +; CHECK-NEXT: addi s1, sp, 32 +; CHECK-NEXT: addi s2, sp, 16 +; CHECK-NEXT: lui a1, 353637 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addiw a0, a0, 32 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addiw a0, a1, 1622 +; CHECK-NEXT: vse8.v v8, (s0) +; CHECK-NEXT: vse8.v v8, (s1) +; CHECK-NEXT: vse8.v v8, (s2) +; CHECK-NEXT: slli a1, a0, 32 +; CHECK-NEXT: add s3, a0, a1 +; CHECK-NEXT: sd s3, 64(sp) +; CHECK-NEXT: call bar +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addiw a0, a0, 32 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vse8.v v8, (s0) +; CHECK-NEXT: vse8.v v8, (s1) +; CHECK-NEXT: vse8.v v8, (s2) +; CHECK-NEXT: sd s3, 64(sp) +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add sp, sp, a1 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addiw a1, a1, -1952 +; CHECK-NEXT: add sp, sp, a1 +; CHECK-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 2008(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 2000(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 1992(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 2032 +; CHECK-NEXT: ret + %1 = alloca %"buff", align 8 + call void @llvm.memset.p0.i64(ptr %1, i8 86, i64 56, i1 
false) + call void @bar() + call void @llvm.memset.p0.i64(ptr %1, i8 86, i64 56, i1 false) + ret i1 false +} + diff --git a/llvm/test/CodeGen/SystemZ/args-12.ll b/llvm/test/CodeGen/SystemZ/args-12.ll index f8954eee550f5..472672bbfd5ca 100644 --- a/llvm/test/CodeGen/SystemZ/args-12.ll +++ b/llvm/test/CodeGen/SystemZ/args-12.ll @@ -2,7 +2,7 @@ ; Test the handling of i128 argument values ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s declare void @bar(i64, i64, i64, i64, i128, i64, i64, i64, i64, i128) diff --git a/llvm/test/CodeGen/SystemZ/args-13.ll b/llvm/test/CodeGen/SystemZ/args-13.ll index d9e986cbb6a4b..29a718901e811 100644 --- a/llvm/test/CodeGen/SystemZ/args-13.ll +++ b/llvm/test/CodeGen/SystemZ/args-13.ll @@ -2,7 +2,7 @@ ; Test incoming i128 arguments. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Do some arithmetic so that we can see the register being used. define void @f1(ptr %r2, i16 %r3, i32 %r4, i64 %r5, i128 %r6) { diff --git a/llvm/test/CodeGen/SystemZ/bitop-intrinsics.ll b/llvm/test/CodeGen/SystemZ/bitop-intrinsics.ll index f5b0aaa243a79..bbd9be463a014 100644 --- a/llvm/test/CodeGen/SystemZ/bitop-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/bitop-intrinsics.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test bit deposit / extract intrinsics ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s declare i64 @llvm.s390.bdepg(i64, i64) declare i64 @llvm.s390.bextg(i64, i64) diff --git a/llvm/test/CodeGen/SystemZ/int-abs-03.ll b/llvm/test/CodeGen/SystemZ/int-abs-03.ll index 238b2431c9b30..2a8969c27fbc0 100644 --- a/llvm/test/CodeGen/SystemZ/int-abs-03.ll +++ b/llvm/test/CodeGen/SystemZ/int-abs-03.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test 128-bit absolute value in vector registers on arch15 +; Test 128-bit absolute value in vector registers on z17 ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s define i128 @f1(i128 %src) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/int-add-19.ll b/llvm/test/CodeGen/SystemZ/int-add-19.ll index a9bce2c827ff9..f5ef08b4514f9 100644 --- a/llvm/test/CodeGen/SystemZ/int-add-19.ll +++ b/llvm/test/CodeGen/SystemZ/int-add-19.ll @@ -2,7 +2,7 @@ ; Test 128-bit addition in vector registers on z13 and later ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s define i128 @f1(i128 %a, i128 %b) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/int-cmp-64.ll b/llvm/test/CodeGen/SystemZ/int-cmp-64.ll index be212ef2a7211..821a57bf30bc1 100644 --- a/llvm/test/CodeGen/SystemZ/int-cmp-64.ll +++ b/llvm/test/CodeGen/SystemZ/int-cmp-64.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test 128-bit comparisons in vector registers on arch15 +; Test 128-bit comparisons in vector 
registers on z17 ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 -verify-machineinstrs | FileCheck %s ; Equality comparison. define i64 @f1(i128 %value1, i128 %value2, i64 %a, i64 %b) { diff --git a/llvm/test/CodeGen/SystemZ/int-conv-15.ll b/llvm/test/CodeGen/SystemZ/int-conv-15.ll index bea0bb8890315..0d8ee75b10b85 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-15.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-15.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test 128-bit arithmetic in vector registers on arch15 +; Test 128-bit arithmetic in vector registers on z17 ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Sign extension from i64. define i128 @f1(i64 %a) { diff --git a/llvm/test/CodeGen/SystemZ/int-div-08.ll b/llvm/test/CodeGen/SystemZ/int-div-08.ll index a3723c1257974..5838d4913c862 100644 --- a/llvm/test/CodeGen/SystemZ/int-div-08.ll +++ b/llvm/test/CodeGen/SystemZ/int-div-08.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test 128-bit division and remainder in vector registers on arch15 +; Test 128-bit division and remainder in vector registers on z17 ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Divide signed. define i128 @f1(i128 %a, i128 %b) { diff --git a/llvm/test/CodeGen/SystemZ/int-max-02.ll b/llvm/test/CodeGen/SystemZ/int-max-02.ll index bd5e9593e25e9..5f5188c66065d 100644 --- a/llvm/test/CodeGen/SystemZ/int-max-02.ll +++ b/llvm/test/CodeGen/SystemZ/int-max-02.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test i128 maximum on arch15. +; Test i128 maximum on z17. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Test with slt. define i128 @f1(i128 %val1, i128 %val2) { diff --git a/llvm/test/CodeGen/SystemZ/int-min-02.ll b/llvm/test/CodeGen/SystemZ/int-min-02.ll index e4cdd25fbc006..3066af924fb8e 100644 --- a/llvm/test/CodeGen/SystemZ/int-min-02.ll +++ b/llvm/test/CodeGen/SystemZ/int-min-02.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test i128 minimum on arch15. +; Test i128 minimum on z17. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Test with slt. define i128 @f1(i128 %val1, i128 %val2) { diff --git a/llvm/test/CodeGen/SystemZ/int-mul-14.ll b/llvm/test/CodeGen/SystemZ/int-mul-14.ll index e7e0889634d10..6678e90f3bfad 100644 --- a/llvm/test/CodeGen/SystemZ/int-mul-14.ll +++ b/llvm/test/CodeGen/SystemZ/int-mul-14.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test 128-bit multiplication in vector registers on arch15 +; Test 128-bit multiplication in vector registers on z17 ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Multiplication. 
define i128 @f1(i128 %a, i128 %b) { diff --git a/llvm/test/CodeGen/SystemZ/int-mul-15.ll b/llvm/test/CodeGen/SystemZ/int-mul-15.ll index a4a0faa0cb0c8..b7d41412d9c5f 100644 --- a/llvm/test/CodeGen/SystemZ/int-mul-15.ll +++ b/llvm/test/CodeGen/SystemZ/int-mul-15.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; Test high-part i64->i128 multiplications on arch15. +; Test high-part i64->i128 multiplications on z17. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Check zero-extended multiplication in which only the high part is used. define i64 @f1(i64 %dummy, i64 %a, i64 %b) { diff --git a/llvm/test/CodeGen/SystemZ/int-mul-16.ll b/llvm/test/CodeGen/SystemZ/int-mul-16.ll index d84ca93e3b12c..772c419dfc8e0 100644 --- a/llvm/test/CodeGen/SystemZ/int-mul-16.ll +++ b/llvm/test/CodeGen/SystemZ/int-mul-16.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test high-part i128->i256 multiplications on arch15. +; Test high-part i128->i256 multiplications on z17. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Multiply high signed. define i128 @f1(i128 %a, i128 %b) { diff --git a/llvm/test/CodeGen/SystemZ/int-neg-04.ll b/llvm/test/CodeGen/SystemZ/int-neg-04.ll index 05b7b397e735d..a6da2db7d14b4 100644 --- a/llvm/test/CodeGen/SystemZ/int-neg-04.ll +++ b/llvm/test/CodeGen/SystemZ/int-neg-04.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test 128-bit negation in vector registers on arch15 +; Test 128-bit negation in vector registers on z17 ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s define i128 @f1(i128 %src) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/int-sub-12.ll b/llvm/test/CodeGen/SystemZ/int-sub-12.ll index 8f7d816d5cbd2..44d2adfb41dc7 100644 --- a/llvm/test/CodeGen/SystemZ/int-sub-12.ll +++ b/llvm/test/CodeGen/SystemZ/int-sub-12.ll @@ -2,7 +2,7 @@ ; Test 128-bit subtraction in vector registers on z13 and later ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s define i128 @f1(i128 %a, i128 %b) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/llxa-01.ll b/llvm/test/CodeGen/SystemZ/llxa-01.ll index 19bc6ef31a286..2c57556dc9ee2 100644 --- a/llvm/test/CodeGen/SystemZ/llxa-01.ll +++ b/llvm/test/CodeGen/SystemZ/llxa-01.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD LOGICAL INDEXED ADDRESS byte instructions. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; DO NOT USE: LLXAB with base and index. 
define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/llxa-02.ll b/llvm/test/CodeGen/SystemZ/llxa-02.ll index 0ca2527dcb25e..e2cd929a0bc94 100644 --- a/llvm/test/CodeGen/SystemZ/llxa-02.ll +++ b/llvm/test/CodeGen/SystemZ/llxa-02.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD LOGICAL INDEXED ADDRESS halfword instructions. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; LLXAH with base and index. define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/llxa-03.ll b/llvm/test/CodeGen/SystemZ/llxa-03.ll index b6c9406785188..b5c91b1d7e607 100644 --- a/llvm/test/CodeGen/SystemZ/llxa-03.ll +++ b/llvm/test/CodeGen/SystemZ/llxa-03.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD LOGICAL INDEXED ADDRESS word instructions. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; LLXAF with base and index. define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/llxa-04.ll b/llvm/test/CodeGen/SystemZ/llxa-04.ll index 9c5cd2f54bc67..186892dd755a7 100644 --- a/llvm/test/CodeGen/SystemZ/llxa-04.ll +++ b/llvm/test/CodeGen/SystemZ/llxa-04.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD LOGICAL INDEXED ADDRESS doubleword instructions. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; LLXAG with base and index. define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/llxa-05.ll b/llvm/test/CodeGen/SystemZ/llxa-05.ll index eba400f6d2564..1e5880de57d58 100644 --- a/llvm/test/CodeGen/SystemZ/llxa-05.ll +++ b/llvm/test/CodeGen/SystemZ/llxa-05.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD LOGICAL INDEXED ADDRESS quadword instructions. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; LLXAQ with base and index. define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/lxa-01.ll b/llvm/test/CodeGen/SystemZ/lxa-01.ll index fb3edeaaeb381..8bba6f78f503d 100644 --- a/llvm/test/CodeGen/SystemZ/lxa-01.ll +++ b/llvm/test/CodeGen/SystemZ/lxa-01.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD INDEXED ADDRESS byte instructions. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; DO NOT USE: LXAB with base and index. define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/lxa-02.ll b/llvm/test/CodeGen/SystemZ/lxa-02.ll index 64816fa24838e..c233bf7d28a5a 100644 --- a/llvm/test/CodeGen/SystemZ/lxa-02.ll +++ b/llvm/test/CodeGen/SystemZ/lxa-02.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD INDEXED ADDRESS halfword instructions. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; LXAH with base and index. define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/lxa-03.ll b/llvm/test/CodeGen/SystemZ/lxa-03.ll index e73d43a48ebd8..43e9b4d14d6c6 100644 --- a/llvm/test/CodeGen/SystemZ/lxa-03.ll +++ b/llvm/test/CodeGen/SystemZ/lxa-03.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD INDEXED ADDRESS word instructions. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; LXAF with base and index. define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/lxa-04.ll b/llvm/test/CodeGen/SystemZ/lxa-04.ll index 7b6764cf22faf..96af585547e34 100644 --- a/llvm/test/CodeGen/SystemZ/lxa-04.ll +++ b/llvm/test/CodeGen/SystemZ/lxa-04.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD INDEXED ADDRESS doubleword instructions. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; LXAG with base and index. define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/lxa-05.ll b/llvm/test/CodeGen/SystemZ/lxa-05.ll index 0a45cba0b3f83..4f0b4e838f157 100644 --- a/llvm/test/CodeGen/SystemZ/lxa-05.ll +++ b/llvm/test/CodeGen/SystemZ/lxa-05.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of LOAD INDEXED ADDRESS quadword instructions. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; LXAQ with base and index. define dso_local ptr @f0(ptr %ptr, i32 %idx) { diff --git a/llvm/test/CodeGen/SystemZ/scalar-ctlz-03.ll b/llvm/test/CodeGen/SystemZ/scalar-ctlz-03.ll index f18ee2418383c..3dbd18fb8cc60 100644 --- a/llvm/test/CodeGen/SystemZ/scalar-ctlz-03.ll +++ b/llvm/test/CodeGen/SystemZ/scalar-ctlz-03.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; ; FIXME: two consecutive immediate adds not fused in i16/i8 functions. 
diff --git a/llvm/test/CodeGen/SystemZ/scalar-ctlz-04.ll b/llvm/test/CodeGen/SystemZ/scalar-ctlz-04.ll index bb50e6f417c42..10d28d571bb92 100644 --- a/llvm/test/CodeGen/SystemZ/scalar-ctlz-04.ll +++ b/llvm/test/CodeGen/SystemZ/scalar-ctlz-04.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s declare i128 @llvm.ctlz.i128(i128, i1) diff --git a/llvm/test/CodeGen/SystemZ/scalar-cttz-03.ll b/llvm/test/CodeGen/SystemZ/scalar-cttz-03.ll index 2f3a72160ae27..e1237280ae23e 100644 --- a/llvm/test/CodeGen/SystemZ/scalar-cttz-03.ll +++ b/llvm/test/CodeGen/SystemZ/scalar-cttz-03.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s declare i64 @llvm.cttz.i64(i64, i1) declare i32 @llvm.cttz.i32(i32, i1) diff --git a/llvm/test/CodeGen/SystemZ/scalar-cttz-04.ll b/llvm/test/CodeGen/SystemZ/scalar-cttz-04.ll index f440871fd4ff0..fdfebef1a1e18 100644 --- a/llvm/test/CodeGen/SystemZ/scalar-cttz-04.ll +++ b/llvm/test/CodeGen/SystemZ/scalar-cttz-04.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test 128-bit arithmetic in vector registers on arch15 +; Test 128-bit arithmetic in vector registers on z17 ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s declare i128 @llvm.cttz.i128(i128, i1) diff --git a/llvm/test/CodeGen/SystemZ/vec-cmp-09.ll b/llvm/test/CodeGen/SystemZ/vec-cmp-09.ll index 3f6c86e685ea1..cb8850e58c589 100644 --- a/llvm/test/CodeGen/SystemZ/vec-cmp-09.ll +++ b/llvm/test/CodeGen/SystemZ/vec-cmp-09.ll @@ -1,6 +1,6 @@ -; Test usage of VBLEND on arch15. +; Test usage of VBLEND on z17. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/vec-div-03.ll b/llvm/test/CodeGen/SystemZ/vec-div-03.ll index 96b161948e39b..1c2a702baf1a3 100644 --- a/llvm/test/CodeGen/SystemZ/vec-div-03.ll +++ b/llvm/test/CodeGen/SystemZ/vec-div-03.ll @@ -1,6 +1,6 @@ -; Test vector division on arch15. +; Test vector division on z17. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Test a v4i32 signed division. define <4 x i32> @f1(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { diff --git a/llvm/test/CodeGen/SystemZ/vec-eval.ll b/llvm/test/CodeGen/SystemZ/vec-eval.ll index 262ab0ea8bb2b..bcdedcd3a407b 100644 --- a/llvm/test/CodeGen/SystemZ/vec-eval.ll +++ b/llvm/test/CodeGen/SystemZ/vec-eval.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; Test use of VECTOR EVALUATE for combined boolean operations. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s define <16 x i8> @eval0(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval0: @@ -279,8 +279,8 @@ entry: define <16 x i8> @eval24(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval24: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v26, %v28, %v24, 2 +; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v24, %v0, %v24, %v1, 47 ; CHECK-NEXT: br %r14 entry: @@ -376,8 +376,8 @@ entry: define <16 x i8> @eval30(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval30: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vn %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v26, %v28, %v24, 2 +; CHECK-NEXT: vn %v1, %v28, %v26 ; CHECK-NEXT: veval %v24, %v0, %v24, %v1, 47 ; CHECK-NEXT: br %r14 entry: @@ -596,8 +596,8 @@ entry: define <16 x i8> @eval45(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval45: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v0, %v28, %v24 ; CHECK-NEXT: veval %v1, %v26, %v24, %v28, 1 +; CHECK-NEXT: vo %v0, %v28, %v24 ; CHECK-NEXT: veval %v1, %v1, %v24, %v26, 47 ; CHECK-NEXT: veval %v24, %v1, %v26, %v0, 47 ; CHECK-NEXT: br %r14 @@ -617,8 +617,8 @@ entry: define <16 x i8> @eval46(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval46: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vn %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v26, %v28, %v24, 8 +; CHECK-NEXT: vn %v1, %v28, %v26 ; CHECK-NEXT: veval %v24, %v0, %v24, %v1, 47 ; CHECK-NEXT: br %r14 entry: @@ -722,8 +722,8 @@ entry: define <16 x i8> @eval54(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval54: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vn %v1, %v28, %v24 ; CHECK-NEXT: veval %v0, %v24, %v28, %v26, 2 +; CHECK-NEXT: vn %v1, %v28, %v24 ; CHECK-NEXT: veval %v24, %v0, %v26, %v1, 47 ; CHECK-NEXT: br %r14 entry: @@ -770,8 +770,8 @@ entry: define <16 x i8> @eval57(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval57: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v0, %v28, %v26 ; CHECK-NEXT: veval %v1, %v26, %v24, %v28, 1 +; CHECK-NEXT: vo %v0, %v28, %v26 ; CHECK-NEXT: veval %v1, %v1, %v26, %v24, 47 ; CHECK-NEXT: veval %v24, %v1, %v24, %v0, 47 ; CHECK-NEXT: br %r14 @@ -1060,8 +1060,8 @@ define <16 x i8> @eval77(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval77: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vgbm %v0, 65535 -; CHECK-NEXT: vn %v1, %v26, %v24 ; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 +; CHECK-NEXT: vn %v1, %v26, %v24 ; CHECK-NEXT: veval %v0, %v28, %v0, %v1, 7 ; CHECK-NEXT: veval %v24, %v0, %v24, %v26, 47 ; CHECK-NEXT: br %r14 @@ -1540,10 +1540,10 @@ define <16 x i8> @eval109(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval109: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vgbm %v0, 65535 -; CHECK-NEXT: vn %v2, %v26, %v24 ; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 -; CHECK-NEXT: vo %v1, %v28, %v24 +; CHECK-NEXT: vn %v2, %v26, %v24 ; CHECK-NEXT: veval %v0, %v28, %v0, %v2, 7 +; CHECK-NEXT: vo %v1, %v28, %v24 ; CHECK-NEXT: veval %v0, %v0, %v24, %v26, 47 ; CHECK-NEXT: veval %v24, %v0, %v26, %v1, 47 ; CHECK-NEXT: br %r14 @@ -1621,8 +1621,8 @@ define <16 x i8> @eval113(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval113: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vgbm %v0, 65535 -; CHECK-NEXT: vn %v1, %v26, %v24 ; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 +; 
CHECK-NEXT: vn %v1, %v26, %v24 ; CHECK-NEXT: veval %v0, %v28, %v0, %v1, 7 ; CHECK-NEXT: veval %v24, %v0, %v26, %v24, 47 ; CHECK-NEXT: br %r14 @@ -1731,8 +1731,8 @@ define <16 x i8> @eval120(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vno %v0, %v24, %v24 ; CHECK-NEXT: veval %v0, %v0, %v28, %v26, 2 -; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v0, %v26, %v24, 47 +; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v24, %v0, %v24, %v1, 47 ; CHECK-NEXT: br %r14 entry: @@ -1753,10 +1753,10 @@ define <16 x i8> @eval121(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval121: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vgbm %v0, 65535 -; CHECK-NEXT: vn %v2, %v26, %v24 ; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 -; CHECK-NEXT: vo %v1, %v28, %v26 +; CHECK-NEXT: vn %v2, %v26, %v24 ; CHECK-NEXT: veval %v0, %v28, %v0, %v2, 7 +; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v0, %v26, %v24, 47 ; CHECK-NEXT: veval %v24, %v0, %v24, %v1, 47 ; CHECK-NEXT: br %r14 @@ -1802,8 +1802,8 @@ define <16 x i8> @eval123(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vno %v0, %v24, %v24 ; CHECK-NEXT: veval %v0, %v0, %v28, %v26, 2 -; CHECK-NEXT: voc %v1, %v26, %v28 ; CHECK-NEXT: veval %v0, %v0, %v26, %v24, 47 +; CHECK-NEXT: voc %v1, %v26, %v28 ; CHECK-NEXT: veval %v24, %v0, %v1, %v24, 31 ; CHECK-NEXT: br %r14 entry: @@ -2084,8 +2084,8 @@ entry: define <16 x i8> @eval141(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval141: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v0, %v26, %v24 ; CHECK-NEXT: veval %v1, %v26, %v24, %v28, 1 +; CHECK-NEXT: vo %v0, %v26, %v24 ; CHECK-NEXT: veval %v1, %v1, %v24, %v26, 47 ; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 143 ; CHECK-NEXT: br %r14 @@ -2105,8 +2105,8 @@ entry: define <16 x i8> @eval142(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval142: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vn %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v26, %v24, %v28, 127 +; CHECK-NEXT: vn %v1, %v28, %v26 ; CHECK-NEXT: veval %v24, %v24, %v1, %v0, 174 ; CHECK-NEXT: br %r14 entry: @@ -2253,8 +2253,8 @@ entry: define <16 x i8> @eval151(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval151: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vx %v0, %v28, %v26 ; CHECK-NEXT: veval %v1, %v24, %v28, %v26, 2 +; CHECK-NEXT: vx %v0, %v28, %v26 ; CHECK-NEXT: veval %v1, %v1, %v26, %v24, 31 ; CHECK-NEXT: veval %v24, %v1, %v0, %v24, 143 ; CHECK-NEXT: br %r14 @@ -2289,8 +2289,8 @@ entry: define <16 x i8> @eval153(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval153: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v24, %v28, %v26, 111 +; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v2, %v26, %v24, %v28, 1 ; CHECK-NEXT: veval %v24, %v2, %v0, %v1, 239 ; CHECK-NEXT: br %r14 @@ -2309,8 +2309,8 @@ entry: define <16 x i8> @eval154(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval154: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v24, %v28, %v26, 111 +; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v2, %v24, %v26, %v28, 2 ; CHECK-NEXT: veval %v24, %v2, %v0, %v1, 239 ; CHECK-NEXT: br %r14 @@ -2330,9 +2330,9 @@ entry: define <16 x i8> @eval155(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval155: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: veval %v0, %v24, %v28, %v26, 111 ; 
CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: vn %v2, %v26, %v24 -; CHECK-NEXT: veval %v0, %v24, %v28, %v26, 111 ; CHECK-NEXT: veval %v24, %v2, %v0, %v1, 239 ; CHECK-NEXT: br %r14 entry: @@ -2365,8 +2365,8 @@ entry: define <16 x i8> @eval157(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval157: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vx %v0, %v28, %v26 ; CHECK-NEXT: veval %v1, %v26, %v24, %v28, 1 +; CHECK-NEXT: vx %v0, %v28, %v26 ; CHECK-NEXT: veval %v1, %v1, %v24, %v26, 47 ; CHECK-NEXT: veval %v24, %v1, %v0, %v24, 143 ; CHECK-NEXT: br %r14 @@ -2386,8 +2386,8 @@ entry: define <16 x i8> @eval158(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval158: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vn %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v24, %v28, %v26, 111 +; CHECK-NEXT: vn %v1, %v28, %v26 ; CHECK-NEXT: veval %v24, %v24, %v1, %v0, 174 ; CHECK-NEXT: br %r14 entry: @@ -2685,8 +2685,8 @@ entry: define <16 x i8> @eval178(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval178: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vn %v1, %v26, %v24 ; CHECK-NEXT: veval %v0, %v26, %v28, %v24, 138 +; CHECK-NEXT: vn %v1, %v26, %v24 ; CHECK-NEXT: veval %v24, %v0, %v1, %v28, 47 ; CHECK-NEXT: br %r14 entry: @@ -2778,8 +2778,8 @@ entry: define <16 x i8> @eval183(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval183: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: voc %v0, %v26, %v28 ; CHECK-NEXT: veval %v1, %v24, %v28, %v26, 2 +; CHECK-NEXT: voc %v0, %v26, %v28 ; CHECK-NEXT: veval %v1, %v1, %v26, %v24, 31 ; CHECK-NEXT: veval %v24, %v1, %v0, %v24, 47 ; CHECK-NEXT: br %r14 @@ -2884,8 +2884,8 @@ entry: define <16 x i8> @eval189(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval189: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: voc %v0, %v26, %v28 ; CHECK-NEXT: veval %v1, %v26, %v24, %v28, 1 +; CHECK-NEXT: voc %v0, %v26, %v28 ; CHECK-NEXT: veval %v1, %v1, %v24, %v26, 47 ; CHECK-NEXT: veval %v24, %v1, %v0, %v24, 47 ; CHECK-NEXT: br %r14 @@ -3480,8 +3480,8 @@ define <16 x i8> @eval228(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval228: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vno %v0, %v26, %v26 -; CHECK-NEXT: vo %v1, %v28, %v24 ; CHECK-NEXT: veval %v2, %v24, %v28, %v26, 2 +; CHECK-NEXT: vo %v1, %v28, %v24 ; CHECK-NEXT: veval %v0, %v2, %v0, %v24, 47 ; CHECK-NEXT: veval %v24, %v0, %v26, %v1, 47 ; CHECK-NEXT: br %r14 @@ -3564,8 +3564,8 @@ entry: define <16 x i8> @eval232(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval232: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v24, %v28, %v26, 31 +; CHECK-NEXT: vo %v1, %v28, %v26 ; CHECK-NEXT: veval %v24, %v24, %v1, %v0, 174 ; CHECK-NEXT: br %r14 entry: @@ -3582,8 +3582,8 @@ entry: define <16 x i8> @eval233(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval233: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vx %v1, %v28, %v26 ; CHECK-NEXT: veval %v0, %v24, %v28, %v26, 31 +; CHECK-NEXT: vx %v1, %v28, %v26 ; CHECK-NEXT: veval %v24, %v24, %v1, %v0, 174 ; CHECK-NEXT: br %r14 entry: diff --git a/llvm/test/CodeGen/SystemZ/vec-intrinsics-05.ll b/llvm/test/CodeGen/SystemZ/vec-intrinsics-05.ll index e750f1e3e7b47..5bbabdd2d56fc 100644 --- a/llvm/test/CodeGen/SystemZ/vec-intrinsics-05.ll +++ b/llvm/test/CodeGen/SystemZ/vec-intrinsics-05.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; Test vector intrinsics added 
with arch15. +; Test vector intrinsics added with z17. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s declare <16 x i8> @llvm.s390.vgemb(<8 x i16>) declare <8 x i16> @llvm.s390.vgemh(<16 x i8>) diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-06.ll b/llvm/test/CodeGen/SystemZ/vec-mul-06.ll index 22b1b5de62c57..3850a8f60eb16 100644 --- a/llvm/test/CodeGen/SystemZ/vec-mul-06.ll +++ b/llvm/test/CodeGen/SystemZ/vec-mul-06.ll @@ -1,6 +1,6 @@ -; Test vector multiplication on arch15. +; Test vector multiplication on z17. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z17 | FileCheck %s ; Test a v2i64 multiplication. define <2 x i64> @f1(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { diff --git a/llvm/test/CodeGen/WebAssembly/inline-asm.ll b/llvm/test/CodeGen/WebAssembly/inline-asm.ll index 4462cfb7aa0c4..c378fd953a555 100644 --- a/llvm/test/CodeGen/WebAssembly/inline-asm.ll +++ b/llvm/test/CodeGen/WebAssembly/inline-asm.ll @@ -129,7 +129,18 @@ entry: ret i32 %ret } +; CHECK-LABEL: v128_load +; CHECK: local.get 0 +; CHECK-NEXT: v128.load 0 +; CHECK-NEXT: local.set 1 +define <4 x i32> @v128_load(ptr %v) #1 { +entry: + %0 = tail call <4 x i32> asm "local.get $1\0Av128.load 0\0Alocal.set $0", "=r,r"(ptr %v) + ret <4 x i32> %0 +} + attributes #0 = { nounwind } +attributes #1 = { "target-features"="+simd128" } !0 = !{i32 47} !1 = !{i32 145} diff --git a/llvm/test/CodeGen/X86/pr138982.ll b/llvm/test/CodeGen/X86/pr138982.ll new file mode 100644 index 0000000000000..32346d823a9fe --- /dev/null +++ b/llvm/test/CodeGen/X86/pr138982.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64 -mattr=+fma | FileCheck %s + +define <4 x float> @pr138982(<4 x float> %in_vec) { +; CHECK-LABEL: pr138982: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-NEXT: vrcpps %xmm0, %xmm2 +; CHECK-NEXT: vrcpps %xmm1, %xmm1 +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpneqps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vblendvps %xmm0, %xmm1, %xmm4, %xmm0 +; CHECK-NEXT: vfnmadd231ps {{.*#+}} xmm0 = -(xmm3 * xmm2) + xmm0 +; CHECK-NEXT: retq +entry: + %fneg = fneg <4 x float> %in_vec + %rcp = tail call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %fneg) + %cmp = fcmp une <4 x float> zeroinitializer, %in_vec + %sel = select <4 x i1> %cmp, <4 x float> %rcp, <4 x float> splat (float 1.000000e+00) + %fma = call nsz <4 x float> @llvm.fma.v4f32(<4 x float> %rcp, <4 x float> zeroinitializer, <4 x float> %sel) + ret <4 x float> %fma +} diff --git a/llvm/test/CodeGen/X86/pr140491-sincos-lifetimes.ll b/llvm/test/CodeGen/X86/pr140491-sincos-lifetimes.ll new file mode 100644 index 0000000000000..2ca99bdc4b316 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr140491-sincos-lifetimes.ll @@ -0,0 +1,70 @@ +; RUN: llc < %s | FileCheck %s + +; This test is reduced from https://github.com/llvm/llvm-project/issues/140491. +; It checks that when `@llvm.sincos.f32` is expanded to a call to +; `sincosf(float, float* out_sin, float* out_cos)` and the store of `%cos` to +; `%computed` is folded into the `sincosf` call. 
The use of `%cos`in the later +; `fneg %cos` -- which expands to a load of `%computed`, will perform the load +; before the `@llvm.lifetime.end.p0(%computed)` to ensure the correct value is +; taken for `%cos`. + +target triple = "x86_64-sie-ps5" + +declare void @use_ptr(ptr readonly) + +define i32 @sincos_stack_slot_with_lifetime(float %in) { +; CHECK-LABEL: sincos_stack_slot_with_lifetime: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $32, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: leaq 12(%rsp), %rdi +; CHECK-NEXT: leaq 8(%rsp), %rbx +; CHECK-NEXT: movq %rbx, %rsi +; CHECK-NEXT: callq sincosf@PLT +; CHECK-NEXT: movss 8(%rsp), %xmm0 # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm0, 16(%rsp) # 16-byte Spill +; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: callq use_ptr +; CHECK-NEXT: movss 12(%rsp), %xmm0 # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss %xmm0, 8(%rsp) +; CHECK-NEXT: leaq 8(%rsp), %rdi +; CHECK-NEXT: callq use_ptr +; CHECK-NEXT: movaps 16(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss %xmm0, 8(%rsp) +; CHECK-NEXT: leaq 8(%rsp), %rdi +; CHECK-NEXT: callq use_ptr +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: addq $32, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %computed = alloca float, align 4 + %computed1 = alloca float, align 4 + %computed3 = alloca float, align 4 + %sincos = tail call { float, float } @llvm.sincos.f32(float %in) + %sin = extractvalue { float, float } %sincos, 0 + %cos = extractvalue { float, float } %sincos, 1 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %computed) + store float %cos, ptr %computed, align 4 + call void @use_ptr(ptr nonnull %computed) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %computed) + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %computed1) + %fneg_sin = fneg float %sin + store float %fneg_sin, ptr %computed1, align 4 + call void @use_ptr(ptr nonnull %computed1) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %computed1) + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %computed3) + %fneg_cos = fneg float %cos + store float %fneg_cos, ptr %computed3, align 4 + call void @use_ptr(ptr nonnull %computed3) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %computed3) + ret i32 0 +} + diff --git a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll index 2b8eedfbbdc9c..863f30e03d2d6 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll @@ -1592,3 +1592,91 @@ entry: %1 = bitcast <8 x i8> %0 to i64 ret i64 %1 } + +define void @foo(<4 x i64> %a, <4 x i64> %b, ptr %p) "min-legal-vector-width"="256" "prefer-vector-width"="256" { +; SSE-LABEL: foo: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: movaps %xmm2, 16(%rdi) +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: foo: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vmovaps %xmm1, 16(%rdi) +; AVX1-NEXT: 
vmovaps %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: foo: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vmovaps %xmm1, 16(%rdi) +; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: foo: +; AVX2-FAST-ALL: # %bb.0: # %entry +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vmovaps %xmm1, 16(%rdi) +; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: foo: +; AVX2-FAST-PERLANE: # %bb.0: # %entry +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, 16(%rdi) +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512F-LABEL: foo: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: foo: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovqd %ymm1, 16(%rdi) +; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: foo: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: foo: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovqd %ymm1, 16(%rdi) +; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rdi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> + %1 = trunc nsw <8 x i64> %0 to <8 x i32> + store <8 x i32> %1, ptr %p, align 16 + ret void +} diff --git a/llvm/test/MC/Disassembler/SystemZ/insns-arch15.txt b/llvm/test/MC/Disassembler/SystemZ/insns-z17.txt similarity index 99% rename from llvm/test/MC/Disassembler/SystemZ/insns-arch15.txt rename to llvm/test/MC/Disassembler/SystemZ/insns-z17.txt index 93274e6659801..c5a30b072d991 100644 --- a/llvm/test/MC/Disassembler/SystemZ/insns-arch15.txt +++ b/llvm/test/MC/Disassembler/SystemZ/insns-z17.txt @@ -1,5 +1,5 @@ -# Test arch15 instructions that don't have PC-relative operands. -# RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=arch15 \ +# Test z17 instructions that don't have PC-relative operands. 
+# RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=z17 \ # RUN: | FileCheck %s # CHECK: bdepg %r0, %r0, %r0 diff --git a/llvm/test/MC/SystemZ/insn-bad-arch15.s b/llvm/test/MC/SystemZ/insn-bad-z17.s similarity index 98% rename from llvm/test/MC/SystemZ/insn-bad-arch15.s rename to llvm/test/MC/SystemZ/insn-bad-z17.s index 915efbc942306..02e26220490f4 100644 --- a/llvm/test/MC/SystemZ/insn-bad-arch15.s +++ b/llvm/test/MC/SystemZ/insn-bad-z17.s @@ -1,5 +1,5 @@ -# For arch15 only. -# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=arch15 < %s 2> %t +# For z17 only. +# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=z17 < %s 2> %t # RUN: FileCheck < %t %s #CHECK: error: invalid use of indexed addressing diff --git a/llvm/test/MC/SystemZ/insn-good-arch15.s b/llvm/test/MC/SystemZ/insn-good-z17.s similarity index 99% rename from llvm/test/MC/SystemZ/insn-good-arch15.s rename to llvm/test/MC/SystemZ/insn-good-z17.s index 46ff13db0b549..96f27137e4821 100644 --- a/llvm/test/MC/SystemZ/insn-good-arch15.s +++ b/llvm/test/MC/SystemZ/insn-good-z17.s @@ -1,5 +1,5 @@ -# For arch15 and above. -# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=arch15 -show-encoding %s \ +# For z17 and above. +# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=z17 -show-encoding %s \ # RUN: | FileCheck %s #CHECK: bdepg %r0, %r0, %r0 # encoding: [0xb9,0x6d,0x00,0x00] diff --git a/llvm/test/MC/X86/gotpcrel-non-globals.ll b/llvm/test/MC/X86/gotpcrel-non-globals.ll new file mode 100644 index 0000000000000..222d2d73ff728 --- /dev/null +++ b/llvm/test/MC/X86/gotpcrel-non-globals.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +; Check that we emit the `@bar_*` symbols, and that we don't emit multiple symbols. + +; CHECK-LABEL: .Lrel_0: +; CHECK: .long foo_0@GOTPCREL+0 +; CHECK-LABEL: .Lrel_1_failed: +; CHECK: .long bar_1-foo_0 +; CHECK-LABEL: .Lrel_2: +; CHECK: .long foo_2@GOTPCREL+0 + +; CHECK: bar_0: +; CHECK: bar_1: +; CHECK: bar_2_indirect: + +@rel_0 = private unnamed_addr constant [1 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (ptr @bar_0 to i64), i64 ptrtoint (ptr @rel_0 to i64)) to i32)] +@rel_1_failed = private unnamed_addr constant [1 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (ptr @bar_1 to i64), i64 ptrtoint (ptr @foo_0 to i64)) to i32)] +@rel_2 = private unnamed_addr constant [1 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (ptr @bar_2_indirect to i64), i64 ptrtoint (ptr @rel_2 to i64)) to i32)] +@bar_0 = internal unnamed_addr constant ptr @foo_0, align 8 +@bar_1 = internal unnamed_addr constant ptr @foo_1, align 8 +@bar_2_indirect = internal unnamed_addr constant ptr @foo_2, align 8 +@foo_0 = external global ptr, align 8 +@foo_1 = external global ptr, align 8 +@foo_2 = external global ptr, align 8 + +define void @foo(ptr %arg0, ptr %arg1) { + store ptr @bar_0, ptr %arg0, align 8 + store ptr @bar_1, ptr %arg1, align 8 + store ptr getelementptr (i8, ptr @bar_2_indirect, i32 1), ptr %arg1, align 8 + ret void +} diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/switch.ll b/llvm/test/Transforms/CorrelatedValuePropagation/switch.ll index a0794d5efe932..7e6aa3eeebe20 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/switch.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/switch.ll @@ -294,6 +294,42 @@ cleanup: ret i32 %retval.0 } +; Make sure that we don't branch into unreachable. 
+ +define void @pr142286() { +; CHECK-LABEL: define void @pr142286() { +; CHECK-NEXT: start: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: br label [[LOOP2:%.*]] +; CHECK: loop2: +; CHECK-NEXT: br label [[LOOP3:%.*]] +; CHECK: loop3: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +start: + br label %loop + +loop: + %phi = phi i8 [ -1, %start ], [ 0, %loop3 ] + br label %loop2 + +loop2: + br label %loop3 + +loop3: + switch i8 %phi, label %exit [ + i8 0, label %loop3 + i8 1, label %loop2 + i8 2, label %loop + ] + +exit: + ret void +} + declare i32 @call0() declare i32 @call1() declare i32 @call2() diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 3c3111492fc68..e8a32cb1697a5 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -4901,3 +4901,50 @@ define i32 @src_simplify_2x_at_once_and(i32 %x, i32 %y) { %cond = select i1 %and0, i32 %sub, i32 %xor ret i32 %cond } + +define void @no_fold_masked_min_loop(ptr nocapture readonly %vals, ptr nocapture readonly %masks, ptr nocapture %out, i64 %n) { +; CHECK-LABEL: @no_fold_masked_min_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXT_INDEX:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi i8 [ -1, [[ENTRY]] ], [ [[RES:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[VALS:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[MASK_PTR:%.*]] = getelementptr inbounds i8, ptr [[MASKS:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[VAL_PTR]], align 1 +; CHECK-NEXT: [[MASK:%.*]] = load i8, ptr [[MASK_PTR]], align 1 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[MASK]], 0 +; CHECK-NEXT: [[MASKED_VAL:%.*]] = select i1 [[COND]], i8 [[VAL]], i8 -1 +; CHECK-NEXT: [[RES]] = call i8 @llvm.umin.i8(i8 [[ACC]], i8 [[MASKED_VAL]]) +; CHECK-NEXT: [[NEXT_INDEX]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXT_INDEX]], [[N:%.*]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: store i8 [[RES]], ptr [[OUT:%.*]], align 1 +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %index = phi i64 [0, %entry], [%next_index, %loop] + %acc = phi i8 [255, %entry], [%res, %loop] + + %val_ptr = getelementptr inbounds i8, ptr %vals, i64 %index + %mask_ptr = getelementptr inbounds i8, ptr %masks, i64 %index + + %val = load i8, ptr %val_ptr, align 1 + %mask = load i8, ptr %mask_ptr, align 1 + + %cond = icmp eq i8 %mask, 0 + %masked_val = select i1 %cond, i8 %val, i8 -1 + %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val) + + %next_index = add i64 %index, 1 + %done = icmp eq i64 %next_index, %n + br i1 %done, label %exit, label %loop + +exit: + store i8 %res, ptr %out, align 1 + ret void +} diff --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll index f20077243273c..877dd1eefbae4 100644 --- a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll +++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll @@ -235,3 +235,38 @@ define <3 x i4> @shuf_bitcast_wrong_size(<2 x i8> %v, i8 %x) { %r = shufflevector <4 x i4> %b, <4 x i4> undef, <3 x i32> ret <3 x i4> %r } + +; Negative test - chain of bitcasts. 
+ +define <16 x i8> @shuf_bitcast_chain(<8 x i32> %v) { +; CHECK-LABEL: @shuf_bitcast_chain( +; CHECK-NEXT: [[S:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[C:%.*]] = bitcast <4 x i32> [[S]] to <16 x i8> +; CHECK-NEXT: ret <16 x i8> [[C]] +; + %s = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> + %a = bitcast <4 x i32> %s to <2 x i64> + %b = bitcast <2 x i64> %a to i128 + %c = bitcast i128 %b to <16 x i8> + ret <16 x i8> %c +} + +; Same as above, but showing why it's not feasable to implement the reverse +; fold in VectorCombine (see #136998). + +define <4 x i32> @shuf_bitcast_chain_2(<8 x i32> %v) { +; CHECK-LABEL: @shuf_bitcast_chain_2( +; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = or <4 x i32> [[S0]], [[S1]] +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %s0 = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> + %s1 = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> + %b0 = bitcast <4 x i32> %s0 to i128 + %b1 = bitcast <4 x i32> %s1 to i128 + %c0 = bitcast i128 %b0 to <4 x i32> + %c1 = bitcast i128 %b1 to <4 x i32> + %r = or <4 x i32> %c0, %c1 + ret <4 x i32> %r +} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll index 254136b0b841a..2ec48a8637dae 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -325,3 +325,79 @@ cleanup: %retval.0 = phi i1 [ false, %if.then ], [ true, %if.end ] ret i1 %retval.0 } + +define i8 @masked_min_reduction(ptr %data, ptr %mask) { +; CHECK-LABEL: @masked_min_reduction( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DATA:%.*]] = getelementptr i8, ptr [[DATA1:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DATA]], i64 32 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DATA]], i64 64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DATA]], i64 96 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[DATA]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <32 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[MASK:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <32 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP6]], align 1 +; 
CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP8]], <32 x i8> [[WIDE_LOAD]], <32 x i8> splat (i8 -1) +; CHECK-NEXT: [[TMP13:%.*]] = select <32 x i1> [[TMP9]], <32 x i8> [[WIDE_LOAD4]], <32 x i8> splat (i8 -1) +; CHECK-NEXT: [[TMP14:%.*]] = select <32 x i1> [[TMP10]], <32 x i8> [[WIDE_LOAD5]], <32 x i8> splat (i8 -1) +; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP11]], <32 x i8> [[WIDE_LOAD6]], <32 x i8> splat (i8 -1) +; CHECK-NEXT: [[TMP16]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI]], <32 x i8> [[TMP12]]) +; CHECK-NEXT: [[TMP17]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI1]], <32 x i8> [[TMP13]]) +; CHECK-NEXT: [[TMP18]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI2]], <32 x i8> [[TMP14]]) +; CHECK-NEXT: [[TMP19]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI3]], <32 x i8> [[TMP15]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP16]], <32 x i8> [[TMP17]]) +; CHECK-NEXT: [[RDX_MINMAX11:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[RDX_MINMAX]], <32 x i8> [[TMP18]]) +; CHECK-NEXT: [[RDX_MINMAX12:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[RDX_MINMAX11]], <32 x i8> [[TMP19]]) +; CHECK-NEXT: [[TMP21:%.*]] = tail call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[RDX_MINMAX12]]) +; CHECK-NEXT: ret i8 [[TMP21]] +; +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %next, %loop ] + %acc = phi i8 [ 255, %entry ], [ %acc_next, %loop ] + + %ptr_i = getelementptr i8, ptr %data, i64 %i + %val = load i8, ptr %ptr_i, align 1 + + %mask_ptr = getelementptr i8, ptr %mask, i64 %i + %m = load i8, ptr %mask_ptr, align 1 + %cond = icmp eq i8 %m, 0 + + ; Use select to implement masking + %masked_val = select i1 %cond, i8 %val, i8 255 + + ; min reduction + %acc_next = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val) + + %next = add i64 %i, 1 + %cmp = icmp ult i64 %next, 1024 + br i1 %cmp, label %loop, label %exit + +exit: + ret i8 %acc_next +} diff --git a/llvm/test/Transforms/RelLookupTableConverter/unnamed_addr.ll b/llvm/test/Transforms/RelLookupTableConverter/unnamed_addr.ll new file mode 100644 index 0000000000000..322c38d090fe1 --- /dev/null +++ b/llvm/test/Transforms/RelLookupTableConverter/unnamed_addr.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; REQUIRES: x86-registered-target +; REQUIRES: aarch64-registered-target +; RUN: opt < %s -passes=rel-lookup-table-converter -relocation-model=pic -mtriple=x86_64-apple-darwin -S | FileCheck -check-prefix=x86_64-apple-darwin %s +; RUN: opt < %s -passes=rel-lookup-table-converter -relocation-model=pic -mtriple=aarch64 -S | FileCheck -check-prefix=aarch64 %s +; RUN: opt < %s -passes=rel-lookup-table-converter -relocation-model=pic -mtriple=x86_64 -S | FileCheck -check-prefix=x86_64 
%s + +@a0 = private unnamed_addr constant i32 0 +@a1 = private unnamed_addr constant i32 1 +@a2 = private unnamed_addr constant i32 2 +@load_relative_1.table = private unnamed_addr constant [3 x ptr] [ptr @a0, ptr @a1, ptr @a2] + +@x0 = internal unnamed_addr constant i64 0 +@x1 = internal unnamed_addr constant i64 1 +@x2 = internal unnamed_addr constant i64 2 +@x3 = internal unnamed_addr constant i64 3 +@y0 = internal unnamed_addr constant ptr @x3 +@y1 = internal unnamed_addr constant ptr @x2 +@y2 = internal unnamed_addr constant ptr @x1 +@y3 = internal unnamed_addr constant ptr @x0 +@load_relative_2.table = private unnamed_addr constant [4 x ptr] [ptr @y3, ptr @y2, ptr @y1, ptr @y0] + +@b0 = private unnamed_addr constant [8 x i8] c"00000000" +@b1 = private unnamed_addr constant [8 x i8] c"11111111" +@b2 = private unnamed_addr constant [8 x i8] c"22222222" +@load_relative_3.table = private unnamed_addr constant [3 x ptr] [ + ptr getelementptr inbounds (i8, ptr @b0, i64 8), + ptr getelementptr inbounds (i8, ptr @b1, i64 8), + ptr getelementptr inbounds (i8, ptr @b2, i64 8)] + +;. +; x86_64-apple-darwin: @a0 = private constant i32 0 +; x86_64-apple-darwin: @a1 = private constant i32 1 +; x86_64-apple-darwin: @a2 = private constant i32 2 +; x86_64-apple-darwin: @load_relative_1.table.rel = private unnamed_addr constant [3 x i32] [i32 trunc (i64 sub (i64 ptrtoint (ptr @a0 to i64), i64 ptrtoint (ptr @load_relative_1.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @a1 to i64), i64 ptrtoint (ptr @load_relative_1.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @a2 to i64), i64 ptrtoint (ptr @load_relative_1.table.rel to i64)) to i32)], align 4 +; x86_64-apple-darwin: @x0 = internal unnamed_addr constant i64 0 +; x86_64-apple-darwin: @x1 = internal unnamed_addr constant i64 1 +; x86_64-apple-darwin: @x2 = internal unnamed_addr constant i64 2 +; x86_64-apple-darwin: @x3 = internal unnamed_addr constant i64 3 +; x86_64-apple-darwin: @y0 = internal constant ptr @x3 +; x86_64-apple-darwin: @y1 = internal constant ptr @x2 +; x86_64-apple-darwin: @y2 = internal constant ptr @x1 +; x86_64-apple-darwin: @y3 = internal constant ptr @x0 +; x86_64-apple-darwin: @load_relative_2.table.rel = private unnamed_addr constant [4 x i32] [i32 trunc (i64 sub (i64 ptrtoint (ptr @y3 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @y2 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @y1 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @y0 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32)], align 4 +; x86_64-apple-darwin: @b0 = private constant [8 x i8] c"00000000" +; x86_64-apple-darwin: @b1 = private constant [8 x i8] c"11111111" +; x86_64-apple-darwin: @b2 = private constant [8 x i8] c"22222222" +; x86_64-apple-darwin: @load_relative_3.table.rel = private unnamed_addr constant [3 x i32] [i32 trunc (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @b0, i64 8) to i64), i64 ptrtoint (ptr @load_relative_3.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @b1, i64 8) to i64), i64 ptrtoint (ptr @load_relative_3.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @b2, i64 8) to i64), i64 ptrtoint (ptr @load_relative_3.table.rel to i64)) to i32)], align 4 +;. 
+; aarch64: @a0 = private constant i32 0 +; aarch64: @a1 = private constant i32 1 +; aarch64: @a2 = private constant i32 2 +; aarch64: @load_relative_1.table.rel = private unnamed_addr constant [3 x i32] [i32 trunc (i64 sub (i64 ptrtoint (ptr @a0 to i64), i64 ptrtoint (ptr @load_relative_1.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @a1 to i64), i64 ptrtoint (ptr @load_relative_1.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @a2 to i64), i64 ptrtoint (ptr @load_relative_1.table.rel to i64)) to i32)], align 4 +; aarch64: @x0 = internal unnamed_addr constant i64 0 +; aarch64: @x1 = internal unnamed_addr constant i64 1 +; aarch64: @x2 = internal unnamed_addr constant i64 2 +; aarch64: @x3 = internal unnamed_addr constant i64 3 +; aarch64: @y0 = internal constant ptr @x3 +; aarch64: @y1 = internal constant ptr @x2 +; aarch64: @y2 = internal constant ptr @x1 +; aarch64: @y3 = internal constant ptr @x0 +; aarch64: @load_relative_2.table.rel = private unnamed_addr constant [4 x i32] [i32 trunc (i64 sub (i64 ptrtoint (ptr @y3 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @y2 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @y1 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @y0 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32)], align 4 +; aarch64: @b0 = private constant [8 x i8] c"00000000" +; aarch64: @b1 = private constant [8 x i8] c"11111111" +; aarch64: @b2 = private constant [8 x i8] c"22222222" +; aarch64: @load_relative_3.table.rel = private unnamed_addr constant [3 x i32] [i32 trunc (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @b0, i64 8) to i64), i64 ptrtoint (ptr @load_relative_3.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @b1, i64 8) to i64), i64 ptrtoint (ptr @load_relative_3.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @b2, i64 8) to i64), i64 ptrtoint (ptr @load_relative_3.table.rel to i64)) to i32)], align 4 +;. 
+; x86_64: @a0 = private unnamed_addr constant i32 0 +; x86_64: @a1 = private unnamed_addr constant i32 1 +; x86_64: @a2 = private unnamed_addr constant i32 2 +; x86_64: @load_relative_1.table.rel = private unnamed_addr constant [3 x i32] [i32 trunc (i64 sub (i64 ptrtoint (ptr @a0 to i64), i64 ptrtoint (ptr @load_relative_1.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @a1 to i64), i64 ptrtoint (ptr @load_relative_1.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @a2 to i64), i64 ptrtoint (ptr @load_relative_1.table.rel to i64)) to i32)], align 4 +; x86_64: @x0 = internal unnamed_addr constant i64 0 +; x86_64: @x1 = internal unnamed_addr constant i64 1 +; x86_64: @x2 = internal unnamed_addr constant i64 2 +; x86_64: @x3 = internal unnamed_addr constant i64 3 +; x86_64: @y0 = internal unnamed_addr constant ptr @x3 +; x86_64: @y1 = internal unnamed_addr constant ptr @x2 +; x86_64: @y2 = internal unnamed_addr constant ptr @x1 +; x86_64: @y3 = internal unnamed_addr constant ptr @x0 +; x86_64: @load_relative_2.table.rel = private unnamed_addr constant [4 x i32] [i32 trunc (i64 sub (i64 ptrtoint (ptr @y3 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @y2 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @y1 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @y0 to i64), i64 ptrtoint (ptr @load_relative_2.table.rel to i64)) to i32)], align 4 +; x86_64: @b0 = private unnamed_addr constant [8 x i8] c"00000000" +; x86_64: @b1 = private unnamed_addr constant [8 x i8] c"11111111" +; x86_64: @b2 = private unnamed_addr constant [8 x i8] c"22222222" +; x86_64: @load_relative_3.table.rel = private unnamed_addr constant [3 x i32] [i32 trunc (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @b0, i64 8) to i64), i64 ptrtoint (ptr @load_relative_3.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @b1, i64 8) to i64), i64 ptrtoint (ptr @load_relative_3.table.rel to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @b2, i64 8) to i64), i64 ptrtoint (ptr @load_relative_3.table.rel to i64)) to i32)], align 4 +;. 
+define ptr @load_relative_1(i64 %offset) { +; x86_64-apple-darwin-LABEL: define ptr @load_relative_1( +; x86_64-apple-darwin-SAME: i64 [[OFFSET:%.*]]) { +; x86_64-apple-darwin-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i64 [[OFFSET]], 2 +; x86_64-apple-darwin-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @load_relative_1.table.rel, i64 [[RELTABLE_SHIFT]]) +; x86_64-apple-darwin-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; +; aarch64-LABEL: define ptr @load_relative_1( +; aarch64-SAME: i64 [[OFFSET:%.*]]) { +; aarch64-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i64 [[OFFSET]], 2 +; aarch64-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @load_relative_1.table.rel, i64 [[RELTABLE_SHIFT]]) +; aarch64-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; +; x86_64-LABEL: define ptr @load_relative_1( +; x86_64-SAME: i64 [[OFFSET:%.*]]) { +; x86_64-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i64 [[OFFSET]], 2 +; x86_64-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @load_relative_1.table.rel, i64 [[RELTABLE_SHIFT]]) +; x86_64-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; + %gep = getelementptr inbounds [3 x ptr], ptr @load_relative_1.table, i64 0, i64 %offset + %load = load ptr, ptr %gep + ret ptr %load +} + +define ptr @load_relative_2(i64 %offset) { +; x86_64-apple-darwin-LABEL: define ptr @load_relative_2( +; x86_64-apple-darwin-SAME: i64 [[OFFSET:%.*]]) { +; x86_64-apple-darwin-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i64 [[OFFSET]], 2 +; x86_64-apple-darwin-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @load_relative_2.table.rel, i64 [[RELTABLE_SHIFT]]) +; x86_64-apple-darwin-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; +; aarch64-LABEL: define ptr @load_relative_2( +; aarch64-SAME: i64 [[OFFSET:%.*]]) { +; aarch64-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i64 [[OFFSET]], 2 +; aarch64-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @load_relative_2.table.rel, i64 [[RELTABLE_SHIFT]]) +; aarch64-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; +; x86_64-LABEL: define ptr @load_relative_2( +; x86_64-SAME: i64 [[OFFSET:%.*]]) { +; x86_64-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i64 [[OFFSET]], 2 +; x86_64-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @load_relative_2.table.rel, i64 [[RELTABLE_SHIFT]]) +; x86_64-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; + %gep = getelementptr inbounds [4 x ptr], ptr @load_relative_2.table, i64 0, i64 %offset + %load = load ptr, ptr %gep + ret ptr %load +} + +define ptr @load_relative_3(i64 %offset) { +; x86_64-apple-darwin-LABEL: define ptr @load_relative_3( +; x86_64-apple-darwin-SAME: i64 [[OFFSET:%.*]]) { +; x86_64-apple-darwin-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i64 [[OFFSET]], 2 +; x86_64-apple-darwin-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @load_relative_3.table.rel, i64 [[RELTABLE_SHIFT]]) +; x86_64-apple-darwin-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; +; aarch64-LABEL: define ptr @load_relative_3( +; aarch64-SAME: i64 [[OFFSET:%.*]]) { +; aarch64-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i64 [[OFFSET]], 2 +; aarch64-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @load_relative_3.table.rel, i64 [[RELTABLE_SHIFT]]) +; aarch64-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; +; x86_64-LABEL: define ptr @load_relative_3( +; x86_64-SAME: i64 [[OFFSET:%.*]]) { +; x86_64-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i64 [[OFFSET]], 2 +; x86_64-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @load_relative_3.table.rel, i64 [[RELTABLE_SHIFT]]) +; 
x86_64-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; + %gep = getelementptr inbounds [3 x ptr], ptr @load_relative_3.table, i64 0, i64 %offset + %load = load ptr, ptr %gep + ret ptr %load +} + +;. +; x86_64-apple-darwin: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +;. +; aarch64: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +;. +; x86_64: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +;. diff --git a/llvm/test/Verifier/sme-attributes.ll b/llvm/test/Verifier/sme-attributes.ll index 4bf5e813daf2f..0ae2b9fd91f52 100644 --- a/llvm/test/Verifier/sme-attributes.ll +++ b/llvm/test/Verifier/sme-attributes.ll @@ -68,3 +68,6 @@ declare void @zt0_inout_out() "aarch64_inout_zt0" "aarch64_out_zt0"; declare void @zt0_inout_agnostic() "aarch64_inout_zt0" "aarch64_za_state_agnostic"; ; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0', 'aarch64_preserves_zt0' and 'aarch64_za_state_agnostic' are mutually exclusive + +declare void @zt0_undef_function() "aarch64_zt0_undef"; +; CHECK: Attribute 'aarch64_zt0_undef' can only be applied to a callsite. diff --git a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test index 2b2bd670613de..d6f6fe10d88c2 100644 --- a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test +++ b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test @@ -16,7 +16,11 @@ # CHECK: fileoff: 0 # The YAML below is the following code +# ``` +# static int foo = 12345; +# int bar = 4567; # int main(int argc, char **argv) { return 0; } +# ``` # Compiled on macOS against the macOS SDK and passing `-Wl,-encryptable` # Contents are removed, since they are not important for the test. We need a # small text segment (smaller than a page). 
@@ -26,8 +30,8 @@ FileHeader: cputype: 0x100000C cpusubtype: 0x0 filetype: 0x2 - ncmds: 15 - sizeofcmds: 696 + ncmds: 18 + sizeofcmds: 920 flags: 0x200085 reserved: 0x0 LoadCommands: @@ -69,7 +73,7 @@ LoadCommands: - sectname: __unwind_info segname: __TEXT addr: 0x100004020 - size: 4152 + size: 88 offset: 0x4020 align: 2 reloff: 0x0 @@ -79,37 +83,61 @@ LoadCommands: reserved2: 0x0 reserved3: 0x0 - cmd: LC_SEGMENT_64 - cmdsize: 72 - segname: __LINKEDIT + cmdsize: 152 + segname: __DATA vmaddr: 4295000064 - vmsize: 592 + vmsize: 16384 fileoff: 32768 - filesize: 592 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 0 + Sections: + - sectname: __data + segname: __DATA + addr: 0x100008000 + size: 4 + offset: 0x8000 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4295016448 + vmsize: 16384 + fileoff: 49152 + filesize: 768 maxprot: 1 initprot: 1 nsects: 0 flags: 0 - cmd: LC_DYLD_CHAINED_FIXUPS cmdsize: 16 - dataoff: 32768 - datasize: 48 + dataoff: 49152 + datasize: 56 - cmd: LC_DYLD_EXPORTS_TRIE cmdsize: 16 - dataoff: 32816 - datasize: 48 + dataoff: 49208 + datasize: 64 - cmd: LC_SYMTAB cmdsize: 24 - symoff: 32872 - nsyms: 2 - stroff: 32904 - strsize: 32 + symoff: 49280 + nsyms: 3 + stroff: 49328 + strsize: 40 - cmd: LC_DYSYMTAB cmdsize: 80 ilocalsym: 0 nlocalsym: 0 iextdefsym: 0 - nextdefsym: 2 - iundefsym: 2 + nextdefsym: 3 + iundefsym: 3 nundefsym: 0 tocoff: 0 ntoc: 0 @@ -123,12 +151,6 @@ LoadCommands: nextrel: 0 locreloff: 0 nlocrel: 0 - - cmd: LC_ENCRYPTION_INFO_64 - cmdsize: 24 - cryptoff: 16384 - cryptsize: 16384 - cryptid: 0 - pad: 0 - cmd: LC_LOAD_DYLINKER cmdsize: 32 name: 12 @@ -136,32 +158,50 @@ LoadCommands: ZeroPadBytes: 7 - cmd: LC_UUID cmdsize: 24 - uuid: 4C4C4447-5555-3144-A18A-01E9EB7E7D92 + uuid: ADDA943C-657A-3A49-9580-168E17A40FFB - cmd: LC_BUILD_VERSION cmdsize: 32 platform: 1 minos: 983040 - sdk: 983552 + sdk: 984320 ntools: 1 Tools: - - tool: 4 - version: 1310720 + - tool: 3 + version: 76481537 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 - cmd: LC_MAIN cmdsize: 24 entryoff: 16384 stacksize: 0 + - cmd: LC_ENCRYPTION_INFO_64 + cmdsize: 24 + cryptoff: 16384 + cryptsize: 16384 + cryptid: 0 + pad: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 - cmd: LC_FUNCTION_STARTS cmdsize: 16 - dataoff: 32864 + dataoff: 49272 datasize: 8 - cmd: LC_DATA_IN_CODE cmdsize: 16 - dataoff: 32872 + dataoff: 49280 datasize: 0 - cmd: LC_CODE_SIGNATURE cmdsize: 16 - dataoff: 32944 - datasize: 416 + dataoff: 49376 + datasize: 544 LinkEditData: ExportTrie: TerminalSize: 0 @@ -173,51 +213,67 @@ LinkEditData: ImportName: '' Children: - TerminalSize: 0 - NodeOffset: 5 + NodeOffset: 25 Name: _ Flags: 0x0 Address: 0x0 Other: 0x0 ImportName: '' Children: + - TerminalSize: 2 + NodeOffset: 9 + Name: _mh_execute_header + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' - TerminalSize: 4 - NodeOffset: 33 - Name: main + NodeOffset: 13 + Name: bar Flags: 0x0 - Address: 0x4000 + Address: 0x8000 Other: 0x0 ImportName: '' - - TerminalSize: 2 - NodeOffset: 39 - Name: _mh_execute_header + - TerminalSize: 4 + NodeOffset: 19 + Name: main Flags: 0x0 - Address: 0x0 + Address: 0x4000 Other: 0x0 ImportName: '' NameList: - n_strx: 2 n_type: 0xF n_sect: 1 + n_desc: 16 + n_value: 4294967296 + - n_strx: 22 + n_type: 0xF + n_sect: 3 
n_desc: 0 - n_value: 4294983680 - - n_strx: 8 + n_value: 4295000064 + - n_strx: 27 n_type: 0xF n_sect: 1 - n_desc: 16 - n_value: 4294967296 + n_desc: 0 + n_value: 4294983680 StringTable: - ' ' - - _main - __mh_execute_header + - _bar + - _main + - '' + - '' + - '' - '' - '' - '' - '' FunctionStarts: [ 0x4000 ] - ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x30, 0x0, - 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x34, 0x0, + 0x0, 0x0, 0x34, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] ... - diff --git a/llvm/test/tools/llvm-rc/Inputs/tag-accelerators-ascii-alt.rc b/llvm/test/tools/llvm-rc/Inputs/tag-accelerators-ascii-alt.rc deleted file mode 100644 index 363263bfe4cf2..0000000000000 --- a/llvm/test/tools/llvm-rc/Inputs/tag-accelerators-ascii-alt.rc +++ /dev/null @@ -1,4 +0,0 @@ -2 ACCELERATORS { - "A", 15, ASCII, ALT -} - diff --git a/llvm/test/tools/llvm-rc/Inputs/tag-accelerators.rc b/llvm/test/tools/llvm-rc/Inputs/tag-accelerators.rc index 90e7f926cc087..bcfc35bdeab68 100644 --- a/llvm/test/tools/llvm-rc/Inputs/tag-accelerators.rc +++ b/llvm/test/tools/llvm-rc/Inputs/tag-accelerators.rc @@ -110,5 +110,6 @@ LANGUAGE 5, 1 "7", 71, VIRTKEY, NOINVERT, CONTROL, SHIFT, ALT "^j", 72, ASCII "^j", 73, ASCII, NOINVERT + "A", 15, ASCII, ALT } diff --git a/llvm/test/tools/llvm-rc/tag-accelerators.test b/llvm/test/tools/llvm-rc/tag-accelerators.test index 336727f617687..4f44aebc75011 100644 --- a/llvm/test/tools/llvm-rc/tag-accelerators.test +++ b/llvm/test/tools/llvm-rc/tag-accelerators.test @@ -37,7 +37,7 @@ ; ACCELERATORS-NEXT: Version (major): 0 ; ACCELERATORS-NEXT: Version (minor): 0 ; ACCELERATORS-NEXT: Characteristics: 0 -; ACCELERATORS-NEXT: Data size: 592 +; ACCELERATORS-NEXT: Data size: 600 ; ACCELERATORS-NEXT: Data: ( ; ACCELERATORS-NEXT: 0000: 00002A00 00000000 01002A00 01000000 |..*.......*.....| ; ACCELERATORS-NEXT: 0010: 02002A00 02000000 03002A00 03000000 |..*.......*.....| @@ -75,7 +75,8 @@ ; ACCELERATORS-NEXT: 0210: 15003700 42000000 0F003700 43000000 |..7.B.....7.C...| ; ACCELERATORS-NEXT: 0220: 1B003700 44000000 17003700 45000000 |..7.D.....7.E...| ; ACCELERATORS-NEXT: 0230: 1D003700 46000000 1F003700 47000000 |..7.F.....7.G...| -; ACCELERATORS-NEXT: 0240: 00000A00 48000000 82000A00 49000000 |....H.......I...| +; ACCELERATORS-NEXT: 0240: 00000A00 48000000 02000A00 49000000 |....H.......I...| +; ACCELERATORS-NEXT: 0250: 90004100 0F000000 |..A.....| ; ACCELERATORS-NEXT: ) @@ -94,19 +95,13 @@ ; RUN: not llvm-rc -no-preprocess /FO %t -- %p/Inputs/tag-accelerators-ascii-control.rc 2>&1 | FileCheck %s --check-prefix ASCII2 ; ASCII2: llvm-rc: Error in ACCELERATORS statement (ID 2): -; ASCII2-NEXT: Accelerator ID 15: Can only apply ALT, SHIFT or CONTROL to VIRTKEY accelerators +; ASCII2-NEXT: Accelerator ID 15: Can only apply SHIFT or CONTROL to VIRTKEY accelerators ; RUN: not llvm-rc -no-preprocess /FO %t -- %p/Inputs/tag-accelerators-ascii-shift.rc 2>&1 | FileCheck %s --check-prefix ASCII3 ; ASCII3: llvm-rc: Error in ACCELERATORS statement (ID 2): -; ASCII3-NEXT: Accelerator ID 15: Can only apply ALT, SHIFT or CONTROL to VIRTKEY accelerators - - -; RUN: not llvm-rc -no-preprocess /FO %t 
diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
index 3af5e24168c8c..f8c77fcba19cf 100644
--- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
+++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
@@ -1,6 +1,7 @@
 #include "Utils/AArch64SMEAttributes.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/SourceMgr.h"
 
@@ -69,6 +70,15 @@ TEST(SMEAttributes, Constructors) {
   ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_new_zt0\"")
                      ->getFunction("foo"))
                   .isNewZT0());
+  ASSERT_TRUE(
+      SA(cast<CallBase>((parseIR("declare void @callee()\n"
+                                 "define void @foo() {"
+                                 "call void @callee() \"aarch64_zt0_undef\"\n"
+                                 "ret void\n}")
+                             ->getFunction("foo")
+                             ->begin()
+                             ->front())))
+          .isUndefZT0());
 
   // Invalid combinations.
   EXPECT_DEBUG_DEATH(SA(SA::SM_Enabled | SA::SM_Compatible),
@@ -215,6 +225,18 @@ TEST(SMEAttributes, Basics) {
   ASSERT_FALSE(ZT0_New.hasSharedZAInterface());
   ASSERT_TRUE(ZT0_New.hasPrivateZAInterface());
 
+  SA ZT0_Undef = SA(SA::ZT0_Undef | SA::encodeZT0State(SA::StateValue::New));
+  ASSERT_TRUE(ZT0_Undef.isNewZT0());
+  ASSERT_FALSE(ZT0_Undef.isInZT0());
+  ASSERT_FALSE(ZT0_Undef.isOutZT0());
+  ASSERT_FALSE(ZT0_Undef.isInOutZT0());
+  ASSERT_FALSE(ZT0_Undef.isPreservesZT0());
+  ASSERT_FALSE(ZT0_Undef.sharesZT0());
+  ASSERT_TRUE(ZT0_Undef.hasZT0State());
+  ASSERT_FALSE(ZT0_Undef.hasSharedZAInterface());
+  ASSERT_TRUE(ZT0_Undef.hasPrivateZAInterface());
+  ASSERT_TRUE(ZT0_Undef.isUndefZT0());
+
   ASSERT_FALSE(SA(SA::Normal).isInZT0());
   ASSERT_FALSE(SA(SA::Normal).isOutZT0());
   ASSERT_FALSE(SA(SA::Normal).isInOutZT0());
@@ -285,6 +307,7 @@ TEST(SMEAttributes, Transitions) {
   SA ZT0_Shared = SA(SA::encodeZT0State(SA::StateValue::In));
   SA ZA_ZT0_Shared = SA(SA::encodeZAState(SA::StateValue::In) |
                         SA::encodeZT0State(SA::StateValue::In));
+  SA Undef_ZT0 = SA(SA::ZT0_Undef);
 
   // Shared ZA -> Private ZA Interface
   ASSERT_FALSE(ZA_Shared.requiresDisablingZABeforeCall(Private_ZA));
@@ -295,6 +318,13 @@ TEST(SMEAttributes, Transitions) {
   ASSERT_TRUE(ZT0_Shared.requiresPreservingZT0(Private_ZA));
   ASSERT_TRUE(ZT0_Shared.requiresEnablingZAAfterCall(Private_ZA));
 
+  // Shared Undef ZT0 -> Private ZA Interface
+  // Note: "Undef ZT0" is a callsite attribute that means ZT0 is undefined at
+  // the point of the call.
+  ASSERT_TRUE(ZT0_Shared.requiresDisablingZABeforeCall(Undef_ZT0));
+  ASSERT_FALSE(ZT0_Shared.requiresPreservingZT0(Undef_ZT0));
+  ASSERT_TRUE(ZT0_Shared.requiresEnablingZAAfterCall(Undef_ZT0));
+
   // Shared ZA & ZT0 -> Private ZA Interface
   ASSERT_FALSE(ZA_ZT0_Shared.requiresDisablingZABeforeCall(Private_ZA));
   ASSERT_TRUE(ZA_ZT0_Shared.requiresPreservingZT0(Private_ZA));
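The new tests above exercise "aarch64_zt0_undef", a callsite attribute saying that ZT0 holds no meaningful data at the call. The practical effect checked in the Transitions test is that a ZT0-sharing caller still has to disable and re-enable ZA around a private-ZA call, but no longer has to spill and refill ZT0. A simplified standalone model of that one decision (not the AArch64SMEAttributes API):

```cpp
#include <cassert>

// Simplified model of the callsite decision exercised above: a ZT0-sharing
// caller calling a callee with a private-ZA interface.
struct Callsite {
  bool CallerSharesZT0; // caller has e.g. "aarch64_in_zt0"
  bool CalleeSharesZT0; // callee exposes a shared-ZT0 interface
  bool ZT0UndefAtCall;  // callsite carries "aarch64_zt0_undef"
};

// ZT0 only needs to be spilled and refilled around the call while its
// contents are still meaningful; "aarch64_zt0_undef" says they are not.
bool requiresPreservingZT0(const Callsite &CS) {
  return CS.CallerSharesZT0 && !CS.CalleeSharesZT0 && !CS.ZT0UndefAtCall;
}

int main() {
  assert(requiresPreservingZT0({true, false, false}));  // plain private-ZA call
  assert(!requiresPreservingZT0({true, false, true}));  // ZT0 undef at the call
  return 0;
}
```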
diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp
index c5b96e1df904e..2a3958151a604 100644
--- a/llvm/unittests/TargetParser/Host.cpp
+++ b/llvm/unittests/TargetParser/Host.cpp
@@ -340,7 +340,7 @@ TEST(getLinuxHostCPUName, s390x) {
 
   // Model Id: 9175
   ExpectedCPUs.push_back("zEC12");
-  ExpectedCPUs.push_back("arch15");
+  ExpectedCPUs.push_back("z17");
 
   // Model Id: 3931
   ExpectedCPUs.push_back("zEC12");
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 7ebfcf915a7c5..5089bc0fd479a 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -507,6 +507,14 @@ TEST(ParseArchString, RejectsDoubleOrTrailingUnderscore) {
 }
 
 TEST(ParseArchString, RejectsDuplicateExtensionNames) {
+  // Zicsr/Zifencei are allowed to duplicate with "g".
+  ASSERT_THAT_EXPECTED(RISCVISAInfo::parseArchString("rv64g_zicsr", true),
+                       Succeeded());
+  ASSERT_THAT_EXPECTED(RISCVISAInfo::parseArchString("rv64g_zifencei", true),
+                       Succeeded());
+  ASSERT_THAT_EXPECTED(
+      RISCVISAInfo::parseArchString("rv64g_zicsr_zifencei", true), Succeeded());
+
   EXPECT_EQ(toString(RISCVISAInfo::parseArchString("rv64ii", true).takeError()),
             "invalid standard user-level extension 'i'");
   EXPECT_EQ(toString(RISCVISAInfo::parseArchString("rv32ee", true).takeError()),
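The new RISC-V cases above rely on "g" being shorthand for "imafd_zicsr_zifencei": spelling Zicsr or Zifencei out again next to "g" is tolerated rather than rejected as a duplicate, while a genuine repeat such as "rv64ii" still fails. A toy illustration of that rule (not the RISCVISAInfo implementation), with the expanded extension list written out by hand:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Toy duplicate check: extensions implied by "g" (zicsr, zifencei) may be
// listed again explicitly; any other repeat is an error.
bool acceptExtensionList(const std::vector<std::string> &Exts) {
  const std::set<std::string> ImpliedByG = {"zicsr", "zifencei"};
  std::set<std::string> Seen;
  for (const std::string &E : Exts)
    if (!Seen.insert(E).second && !ImpliedByG.count(E))
      return false;
  return true;
}

int main() {
  // "rv64g_zicsr_zifencei": g expands to i, m, a, f, d, zicsr, zifencei.
  std::cout << acceptExtensionList({"i", "m", "a", "f", "d", "zicsr",
                                    "zifencei", "zicsr", "zifencei"})
            << "\n";                                      // 1 (accepted)
  std::cout << acceptExtensionList({"i", "i"}) << "\n";   // 0, like "rv64ii"
  return 0;
}
```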
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index 5a789441b9d35..c510fbf0774c2 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -61,7 +61,11 @@ V add(Ty *Address, V Val, atomic::OrderingTy Ordering,
 
 template >
 V load(Ty *Address, atomic::OrderingTy Ordering,
        MemScopeTy MemScope = MemScopeTy::device) {
+#ifdef __NVPTX__
+  return __scoped_atomic_fetch_add(Address, V(0), Ordering, MemScope);
+#else
   return __scoped_atomic_load_n(Address, Ordering, MemScope);
+#endif
 }
 
 template >
diff --git a/third-party/unittest/googletest/src/gtest.cc b/third-party/unittest/googletest/src/gtest.cc
index 30a5cc3f83a7e..37d380a789831 100644
--- a/third-party/unittest/googletest/src/gtest.cc
+++ b/third-party/unittest/googletest/src/gtest.cc
@@ -43,6 +43,7 @@
 #include
 #include  // NOLINT
 #include
 #include
 #include
 #include
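Regarding the Synchronization.h hunk above: on the NVPTX path the scoped atomic load builtin is replaced by an atomic fetch-add of zero, which returns the current value without modifying memory while still carrying the requested ordering and scope. The same idiom expressed with standard C++ atomics (std::atomic rather than the __scoped_atomic_* builtins, so this is only an illustration of the technique, not the DeviceRTL code):

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>

// An atomic fetch_add of zero observes the current value without changing it,
// so it can stand in for an atomic load where a dedicated load primitive is
// not usable.
template <typename T> T atomicLoadViaFetchAdd(std::atomic<T> &A) {
  return A.fetch_add(T(0), std::memory_order_seq_cst);
}

int main() {
  std::atomic<uint32_t> Counter{42};
  std::cout << atomicLoadViaFetchAdd(Counter) << "\n"; // prints 42
  std::cout << Counter.load() << "\n";                 // still 42
  return 0;
}
```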